Diffstat (limited to 'test/CodeGen')
-rw-r--r-- test/CodeGen/2004-03-07-ExternalConstant.c | 2
-rw-r--r-- test/CodeGen/2007-06-18-SextAttrAggregate.c | 2
-rw-r--r-- test/CodeGen/2007-11-07-CopyAggregateAlign.c | 2
-rw-r--r-- test/CodeGen/2007-11-07-ZeroAggregateAlign.c | 2
-rw-r--r-- test/CodeGen/2008-07-21-mixed-var-fn-decl.c | 4
-rw-r--r-- test/CodeGen/64bit-swiftcall.c | 33
-rw-r--r-- test/CodeGen/Atomics.c | 7
-rw-r--r-- test/CodeGen/Inputs/debug-info-embed-source.c | 1
-rw-r--r-- test/CodeGen/Inputs/thinlto-distributed-backend-skip.bc | bin 0 -> 124 bytes
-rw-r--r-- test/CodeGen/aarch64-inline-asm.c | 26
-rw-r--r-- test/CodeGen/aarch64-neon-2velem.c | 23
-rw-r--r-- test/CodeGen/aarch64-neon-dot-product.c | 117
-rw-r--r-- test/CodeGen/aarch64-neon-intrinsics.c | 3708
-rw-r--r-- test/CodeGen/aarch64-neon-ldst-one.c | 1714
-rw-r--r-- test/CodeGen/aarch64-neon-misc.c | 112
-rw-r--r-- test/CodeGen/aarch64-neon-perm.c | 324
-rw-r--r-- test/CodeGen/aarch64-poly64.c | 24
-rw-r--r-- test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c | 659
-rw-r--r-- test/CodeGen/aarch64-v8.2a-neon-intrinsics.c | 63
-rw-r--r-- test/CodeGen/aarch64-varargs-ms.c | 2
-rw-r--r-- test/CodeGen/address-safety-attr-flavors.cpp | 75
-rw-r--r-- test/CodeGen/address-safety-attr-kasan-hwasan.cpp | 53
-rw-r--r-- test/CodeGen/address-safety-attr-kasan.cpp | 38
-rw-r--r-- test/CodeGen/address-sanitizer-and-array-cookie.cpp | 9
-rw-r--r-- test/CodeGen/address-space.c | 17
-rw-r--r-- test/CodeGen/addrsig.c | 20
-rw-r--r-- test/CodeGen/adx-builtins.c | 2
-rw-r--r-- test/CodeGen/aggregate-assign-call.c | 93
-rw-r--r-- test/CodeGen/alias.c | 6
-rw-r--r-- test/CodeGen/arm-aapcs-vfp.c | 4
-rw-r--r-- test/CodeGen/arm-arguments.c | 4
-rw-r--r-- test/CodeGen/arm-build-attributes.c | 4
-rw-r--r-- test/CodeGen/arm-fp16-arguments.c | 24
-rw-r--r-- test/CodeGen/arm-long-calls.c | 4
-rw-r--r-- test/CodeGen/arm-neon-directed-rounding.c | 115
-rw-r--r-- test/CodeGen/arm-neon-dot-product.c | 76
-rw-r--r-- test/CodeGen/arm-neon-fma.c | 24
-rw-r--r-- test/CodeGen/arm-neon-vld.c | 2498
-rw-r--r-- test/CodeGen/arm-neon-vst.c | 2312
-rw-r--r-- test/CodeGen/arm-no-movt.c | 4
-rw-r--r-- test/CodeGen/arm-swiftcall.c | 10
-rw-r--r-- test/CodeGen/arm-target-features.c | 74
-rw-r--r-- test/CodeGen/arm-thumb-mode-target-feature.c | 8
-rw-r--r-- test/CodeGen/arm-v8.2a-neon-intrinsics.c | 989
-rw-r--r-- test/CodeGen/arm64-be-bitfield.c | 2
-rw-r--r-- test/CodeGen/arm64-microsoft-arguments.cpp | 25
-rw-r--r-- test/CodeGen/arm64-microsoft-intrinsics.c | 35
-rw-r--r-- test/CodeGen/arm64-vrnd.c | 29
-rw-r--r-- test/CodeGen/arm64_vdup.c | 2
-rw-r--r-- test/CodeGen/arm_neon_intrinsics.c | 1299
-rw-r--r-- test/CodeGen/array-init.c | 15
-rw-r--r-- test/CodeGen/artificial.c | 10
-rw-r--r-- test/CodeGen/asm-parser-info.S | 12
-rw-r--r-- test/CodeGen/atomic-arm64.c | 2
-rw-r--r-- test/CodeGen/atomic-ops.c | 16
-rw-r--r-- test/CodeGen/atomics-sema-alignment.c | 27
-rw-r--r-- test/CodeGen/attr-cpuspecific.c | 101
-rw-r--r-- test/CodeGen/attr-target-mv-func-ptrs.c | 32
-rw-r--r-- test/CodeGen/attr-target-mv-va-args.c | 26
-rw-r--r-- test/CodeGen/attr-target-mv.c | 91
-rw-r--r-- test/CodeGen/attr-target-x86.c | 16
-rw-r--r-- test/CodeGen/attr-x86-interrupt.c | 8
-rw-r--r-- test/CodeGen/attributes.c | 14
-rw-r--r-- test/CodeGen/avx-builtins.c | 938
-rw-r--r-- test/CodeGen/avx-cmp-builtins.c | 24
-rw-r--r-- test/CodeGen/avx-shuffle-builtins.c | 16
-rw-r--r-- test/CodeGen/avx2-builtins.c | 52
-rw-r--r-- test/CodeGen/avx512-reduceIntrin.c | 618
-rw-r--r-- test/CodeGen/avx512-reduceMinMaxIntrin.c | 4902
-rw-r--r-- test/CodeGen/avx512bw-builtins.c | 136
-rw-r--r-- test/CodeGen/avx512dq-builtins.c | 80
-rw-r--r-- test/CodeGen/avx512f-builtins.c | 3024
-rw-r--r-- test/CodeGen/avx512ifma-builtins.c | 16
-rw-r--r-- test/CodeGen/avx512ifmavl-builtins.c | 32
-rw-r--r-- test/CodeGen/avx512vbmi-builtins.c | 19
-rw-r--r-- test/CodeGen/avx512vbmi2-builtins.c | 60
-rw-r--r-- test/CodeGen/avx512vbmivl-builtin.c | 38
-rw-r--r-- test/CodeGen/avx512vl-builtins.c | 2703
-rw-r--r-- test/CodeGen/avx512vlbitalg-builtins.c | 60
-rw-r--r-- test/CodeGen/avx512vlbw-builtins.c | 221
-rw-r--r-- test/CodeGen/avx512vldq-builtins.c | 93
-rw-r--r-- test/CodeGen/avx512vlvbmi2-builtins.c | 420
-rw-r--r-- test/CodeGen/avx512vlvnni-builtins.c | 136
-rw-r--r-- test/CodeGen/avx512vnni-builtins.c | 32
-rw-r--r-- test/CodeGen/backend-unsupported-error.ll | 8
-rw-r--r-- test/CodeGen/bitscan-builtins.c | 2
-rw-r--r-- test/CodeGen/bittest-intrin.c | 135
-rw-r--r-- test/CodeGen/block-byref-aggr.c | 6
-rw-r--r-- test/CodeGen/blocks-windows.c | 4
-rw-r--r-- test/CodeGen/bmi-builtins.c | 2
-rw-r--r-- test/CodeGen/bmi2-builtins.c | 2
-rw-r--r-- test/CodeGen/builtin-abs.c | 29
-rw-r--r-- test/CodeGen/builtin-clflushopt.c | 2
-rw-r--r-- test/CodeGen/builtin-clwb.c | 2
-rw-r--r-- test/CodeGen/builtin-memfns.c | 10
-rw-r--r-- test/CodeGen/builtin-movdir.c | 36
-rw-r--r-- test/CodeGen/builtin-wbinvd.c | 10
-rw-r--r-- test/CodeGen/builtin-wbnoinvd.c | 9
-rw-r--r-- test/CodeGen/builtins-arm.c | 60
-rw-r--r-- test/CodeGen/builtins-hexagon-circ.c | 156
-rw-r--r-- test/CodeGen/builtins-hexagon.c | 1616
-rw-r--r-- test/CodeGen/builtins-hvx128.c | 802
-rw-r--r-- test/CodeGen/builtins-hvx64.c | 802
-rw-r--r-- test/CodeGen/builtins-mips-args.c | 28
-rw-r--r-- test/CodeGen/builtins-ms.c | 4
-rw-r--r-- test/CodeGen/builtins-nvptx-ptx50.cu | 2
-rw-r--r-- test/CodeGen/builtins-nvptx-sm_70.cu | 489
-rw-r--r-- test/CodeGen/builtins-nvptx.c | 139
-rw-r--r-- test/CodeGen/builtins-overflow.c | 26
-rw-r--r-- test/CodeGen/builtins-ppc-p9-f128.c | 50
-rw-r--r-- test/CodeGen/builtins-ppc-p9vector.c | 2
-rw-r--r-- test/CodeGen/builtins-ppc-vsx.c | 22
-rw-r--r-- test/CodeGen/builtins-systemz-vector-error.c | 140
-rw-r--r-- test/CodeGen/builtins-systemz-vector2-error.c | 32
-rw-r--r-- test/CodeGen/builtins-systemz-zvector-error.c | 4
-rw-r--r-- test/CodeGen/builtins-systemz-zvector2-error.c | 4
-rw-r--r-- test/CodeGen/builtins-wasm.c | 32
-rw-r--r-- test/CodeGen/builtins-x86.c | 10
-rw-r--r-- test/CodeGen/builtins.c | 22
-rw-r--r-- test/CodeGen/c-strings.c | 12
-rw-r--r-- test/CodeGen/c11atomics-ios.c | 46
-rw-r--r-- test/CodeGen/c11atomics.c | 46
-rw-r--r-- test/CodeGen/cetintrin.c | 33
-rw-r--r-- test/CodeGen/cfi-check-fail2.c | 6
-rw-r--r-- test/CodeGen/cfi-icall-cross-dso.c | 14
-rw-r--r-- test/CodeGen/cfi-icall.c | 6
-rw-r--r-- test/CodeGen/cfstring-windows.c | 4
-rw-r--r-- test/CodeGen/clang-sections-attribute.c | 76
-rw-r--r-- test/CodeGen/cldemote.c | 10
-rw-r--r-- test/CodeGen/code-coverage.c | 33
-rw-r--r-- test/CodeGen/compound-literal.c | 4
-rw-r--r-- test/CodeGen/constructor-attribute.c | 56
-rw-r--r-- test/CodeGen/debug-info-block-out-return.c | 2
-rw-r--r-- test/CodeGen/debug-info-cc.c | 120
-rw-r--r-- test/CodeGen/debug-info-codeview-unnamed.c | 30
-rw-r--r-- test/CodeGen/debug-info-embed-source.c | 5
-rw-r--r-- test/CodeGen/debug-info-enum.cpp | 100
-rw-r--r-- test/CodeGen/debug-info-file-checksum.c | 1
-rw-r--r-- test/CodeGen/debug-info-inline-for.c | 13
-rw-r--r-- test/CodeGen/debug-info-vla.c | 8
-rw-r--r-- test/CodeGen/decl.c | 8
-rw-r--r-- test/CodeGen/default-address-space.c | 19
-rw-r--r-- test/CodeGen/delete-null-pointer-checks.c | 20
-rw-r--r-- test/CodeGen/dllexport.c | 48
-rw-r--r-- test/CodeGen/dllimport.c | 26
-rw-r--r-- test/CodeGen/dso-local-executable.c | 112
-rw-r--r-- test/CodeGen/dump-struct-builtin.c | 555
-rw-r--r-- test/CodeGen/elf-linker-options.c | 7
-rw-r--r-- test/CodeGen/emit-summary-index.c | 17
-rw-r--r-- test/CodeGen/exceptions-seh-finally.c | 94
-rw-r--r-- test/CodeGen/exceptions-seh-leave.c | 48
-rw-r--r-- test/CodeGen/exceptions-seh.c | 44
-rw-r--r-- test/CodeGen/ext-vector.c | 4
-rw-r--r-- test/CodeGen/f16c-builtins.c | 2
-rw-r--r-- test/CodeGen/fentry.c | 11
-rw-r--r-- test/CodeGen/fixup-depth-overflow.c | 2
-rw-r--r-- test/CodeGen/flip-dllimport.c | 7
-rw-r--r-- test/CodeGen/fma-builtins.c | 116
-rw-r--r-- test/CodeGen/fma4-builtins.c | 116
-rw-r--r-- test/CodeGen/fsgsbase-builtins.c | 2
-rw-r--r-- test/CodeGen/function-alignment.c | 16
-rw-r--r-- test/CodeGen/function-attributes.c | 4
-rw-r--r-- test/CodeGen/function-min-vector-width.c | 7
-rw-r--r-- test/CodeGen/gfni-builtins.c | 12
-rw-r--r-- test/CodeGen/hexagon-brev-ld-ptr-incdec.c | 52
-rw-r--r-- test/CodeGen/hexagon-brev-store-elm.c | 46
-rw-r--r-- test/CodeGen/hexagon-check-builtins.c | 30
-rw-r--r-- test/CodeGen/init.c | 156
-rw-r--r-- test/CodeGen/inline.c | 12
-rw-r--r-- test/CodeGen/invpcid.c | 12
-rw-r--r-- test/CodeGen/kr-func-promote.c | 8
-rw-r--r-- test/CodeGen/le32-vaarg.c | 2
-rw-r--r-- test/CodeGen/libcalls-fno-builtin.c | 133
-rw-r--r-- test/CodeGen/lto-newpm-pipeline.c | 1
-rw-r--r-- test/CodeGen/lzcnt-builtins.c | 2
-rw-r--r-- test/CodeGen/mangle-ms-string-literals.c | 10
-rw-r--r-- test/CodeGen/mangle-ms.c | 8
-rw-r--r-- test/CodeGen/mangle-windows-rtd.c | 6
-rw-r--r-- test/CodeGen/mangle-windows.c | 64
-rw-r--r-- test/CodeGen/math-builtins.c | 54
-rw-r--r-- test/CodeGen/may-alias.c | 69
-rw-r--r-- test/CodeGen/mbackchain-2.c | 2
-rw-r--r-- test/CodeGen/mbackchain-3.c | 2
-rw-r--r-- test/CodeGen/mcount.c | 22
-rw-r--r-- test/CodeGen/microsoft-call-conv-x64.c | 4
-rw-r--r-- test/CodeGen/mingw-long-double.c | 36
-rw-r--r-- test/CodeGen/mips-vector-return.c | 6
-rw-r--r-- test/CodeGen/mms-bitfields.c | 43
-rw-r--r-- test/CodeGen/mmx-builtins.c | 2
-rw-r--r-- test/CodeGen/ms-align-tentative.c | 12
-rw-r--r-- test/CodeGen/ms-annotation.c | 2
-rw-r--r-- test/CodeGen/ms-barriers-intrinsics.c | 8
-rw-r--r-- test/CodeGen/ms-declspecs.c | 14
-rw-r--r-- test/CodeGen/ms-declspecs.cpp | 10
-rw-r--r-- test/CodeGen/ms-inline-asm-align.c | 2
-rw-r--r-- test/CodeGen/ms-inline-asm-avx512.c | 2
-rw-r--r-- test/CodeGen/ms-inline-asm.c | 7
-rw-r--r-- test/CodeGen/ms-intrinsics-other.c | 11
-rw-r--r-- test/CodeGen/ms-intrinsics-rotations.c | 132
-rw-r--r-- test/CodeGen/ms-intrinsics.c | 155
-rw-r--r-- test/CodeGen/ms-setjmp.c | 18
-rw-r--r-- test/CodeGen/ms-volatile-aarch64.c | 13
-rw-r--r-- test/CodeGen/ms-x86-intrinsics.c | 28
-rw-r--r-- test/CodeGen/ms_abi.c | 25
-rw-r--r-- test/CodeGen/ms_abi_aarch64.c | 10
-rw-r--r-- test/CodeGen/ms_struct-long-double.c | 17
-rw-r--r-- test/CodeGen/ms_this.cpp | 6
-rw-r--r-- test/CodeGen/no-bitfield-type-align.c | 44
-rw-r--r-- test/CodeGen/no-common.c | 12
-rw-r--r-- test/CodeGen/no-ident-version.c | 23
-rw-r--r-- test/CodeGen/no-junk-ftrunc.c | 18
-rw-r--r-- test/CodeGen/no-opt-volatile-memcpy.c | 10
-rw-r--r-- test/CodeGen/no-prototype.c | 20
-rw-r--r-- test/CodeGen/nonnull.c | 30
-rw-r--r-- test/CodeGen/noplt.c | 3
-rw-r--r-- test/CodeGen/opt-record-MIR.c | 5
-rw-r--r-- test/CodeGen/overloadable.c | 6
-rw-r--r-- test/CodeGen/packed-nest-unpacked.c | 10
-rw-r--r-- test/CodeGen/packed-structure.c | 6
-rw-r--r-- test/CodeGen/partial-reinitialization2.c | 16
-rw-r--r-- test/CodeGen/pch-dllexport.cpp | 84
-rw-r--r-- test/CodeGen/personality.c | 43
-rw-r--r-- test/CodeGen/pgo-sample-thinlto-summary.c | 12
-rw-r--r-- test/CodeGen/popcnt-builtins.c | 2
-rw-r--r-- test/CodeGen/ppc-varargs-struct.c | 2
-rw-r--r-- test/CodeGen/ppc64-align-struct.c | 42
-rw-r--r-- test/CodeGen/ppc64-soft-float.c | 10
-rw-r--r-- test/CodeGen/ppc64le-aggregates.c | 2
-rw-r--r-- test/CodeGen/ppc64le-f128Aggregates.c | 124
-rw-r--r-- test/CodeGen/pr19841.cpp | 2
-rw-r--r-- test/CodeGen/pr3518.c | 2
-rw-r--r-- test/CodeGen/pr4349.c | 8
-rw-r--r-- test/CodeGen/pragma-comment.c | 7
-rw-r--r-- test/CodeGen/pragma-do-while.cpp | 36
-rw-r--r-- test/CodeGen/preserve-call-conv.c | 5
-rw-r--r-- test/CodeGen/ptwrite.c | 22
-rw-r--r-- test/CodeGen/rdpid-builtins.c | 10
-rw-r--r-- test/CodeGen/rdrand-builtins.c | 2
-rw-r--r-- test/CodeGen/regcall.c | 60
-rw-r--r-- test/CodeGen/riscv32-abi.c | 430
-rw-r--r-- test/CodeGen/riscv64-abi.c | 422
-rw-r--r-- test/CodeGen/shadowcallstack-attr.c | 16
-rw-r--r-- test/CodeGen/sparc-vaarg.c | 2
-rw-r--r-- test/CodeGen/sparcv8-inline-asm.c | 33
-rw-r--r-- test/CodeGen/sparcv9-inline-asm.c | 32
-rw-r--r-- test/CodeGen/spir-half-type.cpp | 146
-rw-r--r-- test/CodeGen/split-debug-filename.c | 7
-rw-r--r-- test/CodeGen/split-stacks.c | 10
-rw-r--r-- test/CodeGen/sse-builtins.c | 35
-rw-r--r-- test/CodeGen/sse2-builtins.c | 50
-rw-r--r-- test/CodeGen/sse3-builtins.c | 2
-rw-r--r-- test/CodeGen/sse41-builtins.c | 28
-rw-r--r-- test/CodeGen/sse42-builtins.c | 2
-rw-r--r-- test/CodeGen/ssse3-builtins.c | 2
-rw-r--r-- test/CodeGen/stack-arg-probe.c | 8
-rw-r--r-- test/CodeGen/stack-protector.c | 20
-rw-r--r-- test/CodeGen/stack-size-section.c | 9
-rw-r--r-- test/CodeGen/string-literal-short-wstring.c | 4
-rw-r--r-- test/CodeGen/target-data.c | 8
-rw-r--r-- test/CodeGen/target-features-error-2.c | 38
-rw-r--r-- test/CodeGen/target-features-error.c | 3
-rw-r--r-- test/CodeGen/tbaa-base.cpp | 58
-rw-r--r-- test/CodeGen/tbaa-cast.cpp | 13
-rw-r--r-- test/CodeGen/tbaa-class.cpp | 67
-rw-r--r-- test/CodeGen/tbaa-for-vptr.cpp | 16
-rw-r--r-- test/CodeGen/tbaa-ms-abi.cpp | 15
-rw-r--r-- test/CodeGen/tbaa-reference.cpp | 20
-rw-r--r-- test/CodeGen/tbaa-struct.cpp | 138
-rw-r--r-- test/CodeGen/tentative-decls.c | 14
-rw-r--r-- test/CodeGen/thinlto-backend-option.ll | 6
-rw-r--r-- test/CodeGen/thinlto-diagnostic-handler-remarks-with-hotness.ll | 49
-rw-r--r-- test/CodeGen/thinlto-distributed-backend-skip.ll | 21
-rw-r--r-- test/CodeGen/thinlto-distributed-cfi-devirt.ll | 109
-rw-r--r-- test/CodeGen/thinlto-distributed-cfi.ll | 74
-rw-r--r-- test/CodeGen/thinlto-distributed.ll | 21
-rw-r--r-- test/CodeGen/thinlto-split-dwarf.c | 21
-rw-r--r-- test/CodeGen/thinlto_backend.ll | 21
-rw-r--r-- test/CodeGen/transparent-union-redecl.c | 44
-rw-r--r-- test/CodeGen/vaes-builtins.c | 4
-rw-r--r-- test/CodeGen/variadic-null-win64.c | 2
-rw-r--r-- test/CodeGen/vector-scalar.c | 42
-rw-r--r-- test/CodeGen/vector.c | 6
-rw-r--r-- test/CodeGen/vectorcall.c | 80
-rw-r--r-- test/CodeGen/vla.c | 6
-rw-r--r-- test/CodeGen/vld_dup.c | 50
-rw-r--r-- test/CodeGen/volatile-1.c | 6
-rw-r--r-- test/CodeGen/volatile.c | 4
-rw-r--r-- test/CodeGen/vpclmulqdq-builtins.c | 4
-rw-r--r-- test/CodeGen/waitpkg.c | 25
-rw-r--r-- test/CodeGen/wasm-arguments.c | 4
-rw-r--r-- test/CodeGen/wasm-varargs.c | 10
-rw-r--r-- test/CodeGen/wchar-const.c | 2
-rw-r--r-- test/CodeGen/windows-itanium.c | 4
-rw-r--r-- test/CodeGen/windows-on-arm-dllimport-dllexport.c | 4
-rw-r--r-- test/CodeGen/windows-struct-abi.c | 12
-rw-r--r-- test/CodeGen/windows-swiftcall.c | 62
-rw-r--r-- test/CodeGen/wmemcmp.c | 37
-rw-r--r-- test/CodeGen/x86-atomic-long_double.c | 56
-rw-r--r-- test/CodeGen/x86-builtins-vector-width.c | 32
-rw-r--r-- test/CodeGen/x86-cf-protection.c | 8
-rw-r--r-- test/CodeGen/x86-nontemporal.c | 2
-rw-r--r-- test/CodeGen/x86_32-arguments-realign.c | 2
-rw-r--r-- test/CodeGen/x86_32-arguments-win32.c | 18
-rw-r--r-- test/CodeGen/x86_32-fpcc-struct-return.c | 8
-rw-r--r-- test/CodeGen/x86_64-arguments-win32.c | 16
-rw-r--r-- test/CodeGen/x86_64-arguments.c | 2
-rw-r--r-- test/CodeGen/x86_64-floatvectors.c | 131
-rw-r--r-- test/CodeGen/xcore-abi.c | 4
-rw-r--r-- test/CodeGen/xray-always-emit-typedevent.cpp | 10
-rw-r--r-- test/CodeGen/xray-always-instrument.cpp | 10
-rw-r--r-- test/CodeGen/xray-attr-list.cpp | 19
-rw-r--r-- test/CodeGen/xray-imbue-arg1.cpp | 9
-rw-r--r-- test/CodeGen/xray-instrumentation-bundles.cpp | 50
-rw-r--r-- test/CodeGen/xray-never-instrument.cpp | 24
-rw-r--r-- test/CodeGen/xray-typedevent.cpp | 34
315 files changed, 25892 insertions, 14144 deletions
diff --git a/test/CodeGen/2004-03-07-ExternalConstant.c b/test/CodeGen/2004-03-07-ExternalConstant.c
index 2de3a69bd679..36371c073d8d 100644
--- a/test/CodeGen/2004-03-07-ExternalConstant.c
+++ b/test/CodeGen/2004-03-07-ExternalConstant.c
@@ -1,6 +1,6 @@
// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s
-// CHECK: @a = external constan
+// CHECK: @a = external {{(dso_local )?}}constan
extern const int a[]; // 'a' should be marked constant even though it's external!
int foo () {
return a[0];
diff --git a/test/CodeGen/2007-06-18-SextAttrAggregate.c b/test/CodeGen/2007-06-18-SextAttrAggregate.c
index 23c211d3ba04..e781bd567374 100644
--- a/test/CodeGen/2007-06-18-SextAttrAggregate.c
+++ b/test/CodeGen/2007-06-18-SextAttrAggregate.c
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 %s -o - -emit-llvm | FileCheck %s
-// XFAIL: aarch64, arm64, x86_64-pc-win32, x86_64-w64-mingw32
+// XFAIL: aarch64, arm64, x86_64-pc-win32, x86_64-w64-mingw32, x86_64-pc-windows-gnu
// PR1513
diff --git a/test/CodeGen/2007-11-07-CopyAggregateAlign.c b/test/CodeGen/2007-11-07-CopyAggregateAlign.c
index 08d97702e69a..769a38a7695b 100644
--- a/test/CodeGen/2007-11-07-CopyAggregateAlign.c
+++ b/test/CodeGen/2007-11-07-CopyAggregateAlign.c
@@ -2,6 +2,6 @@
struct A { char s, t, u, v; short a; };
// CHECK: %a = alloca %struct.A, align 2
// CHECK: %b = alloca %struct.A, align 2
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{.*}}, i32 2, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{.*}} align 2 {{.*}} align 2 {{.*}}, i1 false)
void q() { struct A a, b; a = b; }
diff --git a/test/CodeGen/2007-11-07-ZeroAggregateAlign.c b/test/CodeGen/2007-11-07-ZeroAggregateAlign.c
index b059607ba586..2b8cfe1f8f77 100644
--- a/test/CodeGen/2007-11-07-ZeroAggregateAlign.c
+++ b/test/CodeGen/2007-11-07-ZeroAggregateAlign.c
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s
struct A { short s; short t; int i; };
// CHECK: %a = alloca %struct.A, align 4
-// CHECK: call void @llvm.memset.p0i8.{{.*}}i32 4, i1 false)
+// CHECK: call void @llvm.memset.p0i8.{{.*}} align 4 {{.*}}, i1 false)
void q() { struct A a = {0}; }
diff --git a/test/CodeGen/2008-07-21-mixed-var-fn-decl.c b/test/CodeGen/2008-07-21-mixed-var-fn-decl.c
index ac132604399e..3c9591e2945e 100644
--- a/test/CodeGen/2008-07-21-mixed-var-fn-decl.c
+++ b/test/CodeGen/2008-07-21-mixed-var-fn-decl.c
@@ -3,6 +3,6 @@
int g0, f0();
int f1(), g1;
-// CHECK: @g0 = common global i32 0, align 4
-// CHECK: @g1 = common global i32 0, align 4
+// CHECK: @g0 = common {{(dso_local )?}}global i32 0, align 4
+// CHECK: @g1 = common {{(dso_local )?}}global i32 0, align 4
diff --git a/test/CodeGen/64bit-swiftcall.c b/test/CodeGen/64bit-swiftcall.c
index 92ba37cd7fe6..7486e44406ed 100644
--- a/test/CodeGen/64bit-swiftcall.c
+++ b/test/CodeGen/64bit-swiftcall.c
@@ -108,9 +108,7 @@ typedef struct {
TEST(struct_1);
// CHECK-LABEL: define swiftcc { i64, i64 } @return_struct_1() {{.*}}{
// CHECK: [[RET:%.*]] = alloca [[STRUCT1:%.*]], align 4
-// CHECK: [[VAR:%.*]] = alloca [[STRUCT1]], align 4
// CHECK: call void @llvm.memset
-// CHECK: call void @llvm.memcpy
// CHECK: [[CAST:%.*]] = bitcast [[STRUCT1]]* %retval to { i64, i64 }*
// CHECK: [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0
// CHECK: [[T0:%.*]] = load i64, i64* [[GEP0]], align 4
@@ -158,12 +156,8 @@ typedef struct {
TEST(struct_2);
// CHECK-LABEL: define swiftcc { i64, i64 } @return_struct_2() {{.*}}{
// CHECK: [[RET:%.*]] = alloca [[STRUCT2_TYPE]], align 4
-// CHECK: [[VAR:%.*]] = alloca [[STRUCT2_TYPE]], align 4
-// CHECK: [[CASTVAR:%.*]] = bitcast {{.*}} [[VAR]]
+// CHECK: [[CASTVAR:%.*]] = bitcast {{.*}} [[RET]]
// CHECK: call void @llvm.memcpy{{.*}}({{.*}}[[CASTVAR]], {{.*}}[[STRUCT2_RESULT]]
-// CHECK: [[CASTRET:%.*]] = bitcast {{.*}} [[RET]]
-// CHECK: [[CASTVAR:%.*]] = bitcast {{.*}} [[VAR]]
-// CHECK: call void @llvm.memcpy{{.*}}({{.*}}[[CASTRET]], {{.*}}[[CASTVAR]]
// CHECK: [[CAST:%.*]] = bitcast [[STRUCT2_TYPE]]* [[RET]] to { i64, i64 }*
// CHECK: [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0
// CHECK: [[T0:%.*]] = load i64, i64* [[GEP0]], align 4
@@ -214,12 +208,8 @@ typedef struct {
TEST(struct_misaligned_1)
// CHECK-LABEL: define swiftcc i64 @return_struct_misaligned_1()
// CHECK: [[RET:%.*]] = alloca [[STRUCT:%.*]], align 1
-// CHECK: [[RES:%.*]] = alloca [[STRUCT]], align 1
-// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8*
-// CHECK: call void @llvm.memset{{.*}}(i8* [[CAST]], i8 0, i64 5
-// CHECK: [[CASTRET:%.*]] = bitcast [[STRUCT]]* [[RET]] to i8*
-// CHECK: [[CASTRES:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CASTRET]], i8* [[CASTRES]], i64 5
+// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[RET]] to i8*
+// CHECK: call void @llvm.memset{{.*}}(i8* align 1 [[CAST]], i8 0, i64 5
// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[RET]] to { i64 }*
// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
// CHECK: [[R0:%.*]] = load i64, i64* [[GEP]], align 1
@@ -267,12 +257,8 @@ typedef union {
TEST(union_het_fp)
// CHECK-LABEL: define swiftcc i64 @return_union_het_fp()
// CHECK: [[RET:%.*]] = alloca [[UNION:%.*]], align 8
-// CHECK: [[RES:%.*]] = alloca [[UNION]], align 8
-// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[RES]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CAST]]
-// CHECK: [[CASTRET:%.*]] = bitcast [[UNION]]* [[RET]] to i8*
-// CHECK: [[CASTRES:%.*]] = bitcast [[UNION]]* [[RES]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CASTRET]], i8* [[CASTRES]]
+// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[RET]] to i8*
+// CHECK: call void @llvm.memcpy{{.*}}(i8* align 8 [[CAST]]
// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[RET]] to { i64 }*
// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
// CHECK: [[R0:%.*]] = load i64, i64* [[GEP]], align 8
@@ -1032,3 +1018,12 @@ typedef union {
TEST(union_hom_fp_partial2)
// X86-64-LABEL: take_union_hom_fp_partial2(i64, float)
// ARM64-LABEL: take_union_hom_fp_partial2(i64, float)
+
+// At one point, we emitted lifetime.ends without a matching lifetime.start for
+// CoerceAndExpanded args. Since we're not performing optimizations, neither
+// intrinsic should be emitted.
+// CHECK-LABEL: define void @no_lifetime_markers
+void no_lifetime_markers() {
+ // CHECK-NOT: call void @llvm.lifetime.
+ take_int5(return_int5());
+}
diff --git a/test/CodeGen/Atomics.c b/test/CodeGen/Atomics.c
index f957883575de..d960ac6df2f9 100644
--- a/test/CodeGen/Atomics.c
+++ b/test/CodeGen/Atomics.c
@@ -291,3 +291,10 @@ void test_lock (void)
__sync_lock_release (&sll); // CHECK: store atomic {{.*}} release, align 8
__sync_lock_release (&ull); // CHECK: store atomic {{.*}} release, align 8
}
+
+void test_atomic(void) {
+ ui = __atomic_fetch_min(&ui, 5, __ATOMIC_RELAXED); // CHECK: atomicrmw umin {{.*}} monotonic
+ si = __atomic_fetch_min(&si, 5, __ATOMIC_SEQ_CST); // CHECK: atomicrmw min {{.*}} seq_cst
+ ui = __atomic_fetch_max(&ui, 5, __ATOMIC_ACQUIRE); // CHECK: atomicrmw umax {{.*}} acquire
+ si = __atomic_fetch_max(&si, 5, __ATOMIC_RELEASE); // CHECK: atomicrmw max {{.*}} release
+}
diff --git a/test/CodeGen/Inputs/debug-info-embed-source.c b/test/CodeGen/Inputs/debug-info-embed-source.c
new file mode 100644
index 000000000000..2fb55eed6b26
--- /dev/null
+++ b/test/CodeGen/Inputs/debug-info-embed-source.c
@@ -0,0 +1 @@
+void foo() { }
diff --git a/test/CodeGen/Inputs/thinlto-distributed-backend-skip.bc b/test/CodeGen/Inputs/thinlto-distributed-backend-skip.bc
new file mode 100644
index 000000000000..0243eb696055
--- /dev/null
+++ b/test/CodeGen/Inputs/thinlto-distributed-backend-skip.bc
Binary files differ
diff --git a/test/CodeGen/aarch64-inline-asm.c b/test/CodeGen/aarch64-inline-asm.c
index a1078f1bab83..0889a7157f0b 100644
--- a/test/CodeGen/aarch64-inline-asm.c
+++ b/test/CodeGen/aarch64-inline-asm.c
@@ -44,9 +44,9 @@ void test_constraints_immed(void) {
void test_constraint_S(void) {
int *addr;
- asm("adrp %0, %A1\n\t"
- "add %0, %0, %L1" : "=r"(addr) : "S"(&var));
-// CHECK: call i32* asm "adrp $0, ${1:A}\0A\09add $0, $0, ${1:L}", "=r,S"(i64* @var)
+ asm("adrp %0, %1\n\t"
+ "add %0, %0, :lo12:%1" : "=r"(addr) : "S"(&var));
+// CHECK: call i32* asm "adrp $0, $1\0A\09add $0, $0, :lo12:$1", "=r,S"(i64* @var)
}
void test_constraint_Q(void) {
@@ -54,3 +54,23 @@ void test_constraint_Q(void) {
asm("ldxr %0, %1" : "=r"(val) : "Q"(var));
// CHECK: call i32 asm "ldxr $0, $1", "=r,*Q"(i64* @var)
}
+
+void test_gcc_registers(void) {
+ register unsigned long reg0 asm("r0") = 0;
+ register unsigned long reg1 asm("r1") = 1;
+ register unsigned int reg29 asm("r29") = 2;
+ register unsigned int reg30 asm("r30") = 3;
+
+ // Test remapping register names in register ... asm("rN") statements.
+ // rN register operands in these two inline assembly lines
+ // should get renamed to valid AArch64 registers.
+ asm volatile("hvc #0" : : "r" (reg0), "r" (reg1));
+ // CHECK: call void asm sideeffect "hvc #0", "{x0},{x1}"
+ asm volatile("hvc #0" : : "r" (reg29), "r" (reg30));
+ // CHECK: call void asm sideeffect "hvc #0", "{fp},{lr}"
+
+ // rN registers when used without register ... asm("rN") syntax
+ // should not be remapped.
+ asm volatile("mov r0, r1\n");
+ // CHECK: call void asm sideeffect "mov r0, r1\0A", ""()
+}
diff --git a/test/CodeGen/aarch64-neon-2velem.c b/test/CodeGen/aarch64-neon-2velem.c
index 3867b01afb7a..320ce0286bed 100644
--- a/test/CodeGen/aarch64-neon-2velem.c
+++ b/test/CodeGen/aarch64-neon-2velem.c
@@ -3083,6 +3083,17 @@ float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
return vfma_n_f32(a, b, n);
}
+// CHECK-LABEL: @test_vfma_n_f64(
+// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x double> undef, double %n, i32 0
+// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
+// CHECK: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> [[VECINIT_I]], <1 x double> %a)
+// CHECK: ret <1 x double> [[TMP3]]
+float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
+ return vfma_n_f64(a, b, n);
+}
+
// CHECK-LABEL: @test_vfmaq_n_f32(
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
@@ -3110,6 +3121,18 @@ float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
return vfms_n_f32(a, b, n);
}
+// CHECK-LABEL: @test_vfms_n_f64(
+// CHECK: [[SUB_I:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
+// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x double> undef, double %n, i32 0
+// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB_I]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
+// CHECK: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[SUB_I]], <1 x double> [[VECINIT_I]], <1 x double> %a)
+// CHECK: ret <1 x double> [[TMP3]]
+float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
+ return vfms_n_f64(a, b, n);
+}
+
// CHECK-LABEL: @test_vfmsq_n_f32(
// CHECK: [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
diff --git a/test/CodeGen/aarch64-neon-dot-product.c b/test/CodeGen/aarch64-neon-dot-product.c
new file mode 100644
index 000000000000..de18db5655c1
--- /dev/null
+++ b/test/CodeGen/aarch64-neon-dot-product.c
@@ -0,0 +1,117 @@
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +dotprod \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -instcombine | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+// Test AArch64 Armv8.2-A dot product intrinsics
+
+#include <arm_neon.h>
+
+uint32x2_t test_vdot_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: ret <2 x i32> [[RESULT]]
+ return vdot_u32(a, b, c);
+}
+
+uint32x4_t test_vdotq_u32(uint32x4_t a, uint8x16_t b, uint8x16_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: ret <4 x i32> [[RESULT]]
+ return vdotq_u32(a, b, c);
+}
+
+int32x2_t test_vdot_s32(int32x2_t a, int8x8_t b, int8x8_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: ret <2 x i32> [[RESULT]]
+ return vdot_s32(a, b, c);
+}
+
+int32x4_t test_vdotq_s32(int32x4_t a, int8x16_t b, int8x16_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: ret <4 x i32> [[RESULT]]
+ return vdotq_s32(a, b, c);
+}
+
+uint32x2_t test_vdot_lane_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
+// CHECK: ret <2 x i32> [[RESULT]]
+ return vdot_lane_u32(a, b, c, 1);
+}
+
+uint32x4_t test_vdotq_lane_u32(uint32x4_t a, uint8x16_t b, uint8x8_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
+// CHECK: ret <4 x i32> [[RESULT]]
+ return vdotq_lane_u32(a, b, c, 1);
+}
+
+uint32x2_t test_vdot_laneq_u32(uint32x2_t a, uint8x8_t b, uint8x16_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_laneq_u32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
+// CHECK: ret <2 x i32> [[RESULT]]
+ return vdot_laneq_u32(a, b, c, 1);
+}
+
+uint32x4_t test_vdotq_laneq_u32(uint32x4_t a, uint8x16_t b, uint8x16_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_laneq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
+// CHECK: ret <4 x i32> [[RESULT]]
+ return vdotq_laneq_u32(a, b, c, 1);
+}
+
+int32x2_t test_vdot_lane_s32(int32x2_t a, int8x8_t b, int8x8_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
+// CHECK: ret <2 x i32> [[RESULT]]
+ return vdot_lane_s32(a, b, c, 1);
+}
+
+int32x4_t test_vdotq_lane_s32(int32x4_t a, int8x16_t b, int8x8_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
+// CHECK: ret <4 x i32> [[RESULT]]
+ return vdotq_lane_s32(a, b, c, 1);
+}
+
+int32x2_t test_vdot_laneq_s32(int32x2_t a, int8x8_t b, int8x16_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_laneq_s32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
+// CHECK: ret <2 x i32> [[RESULT]]
+ return vdot_laneq_s32(a, b, c, 1);
+}
+
+int32x4_t test_vdotq_laneq_s32(int32x4_t a, int8x16_t b, int8x16_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_laneq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
+// CHECK: ret <4 x i32> [[RESULT]]
+ return vdotq_laneq_s32(a, b, c, 1);
+}
+
diff --git a/test/CodeGen/aarch64-neon-intrinsics.c b/test/CodeGen/aarch64-neon-intrinsics.c
index cbc2e72fcbac..40e39912be9a 100644
--- a/test/CodeGen/aarch64-neon-intrinsics.c
+++ b/test/CodeGen/aarch64-neon-intrinsics.c
@@ -9203,7 +9203,7 @@ poly16x4_t test_vld1_p16(poly16_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x2_t [[TMP5]]
uint8x16x2_t test_vld2q_u8(uint8_t const *a) {
@@ -9221,7 +9221,7 @@ uint8x16x2_t test_vld2q_u8(uint8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x2_t [[TMP6]]
uint16x8x2_t test_vld2q_u16(uint16_t const *a) {
@@ -9239,7 +9239,7 @@ uint16x8x2_t test_vld2q_u16(uint16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x2_t [[TMP6]]
uint32x4x2_t test_vld2q_u32(uint32_t const *a) {
@@ -9257,7 +9257,7 @@ uint32x4x2_t test_vld2q_u32(uint32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
uint64x2x2_t test_vld2q_u64(uint64_t const *a) {
@@ -9274,7 +9274,7 @@ uint64x2x2_t test_vld2q_u64(uint64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x2_t [[TMP5]]
int8x16x2_t test_vld2q_s8(int8_t const *a) {
@@ -9292,7 +9292,7 @@ int8x16x2_t test_vld2q_s8(int8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x2_t [[TMP6]]
int16x8x2_t test_vld2q_s16(int16_t const *a) {
@@ -9310,7 +9310,7 @@ int16x8x2_t test_vld2q_s16(int16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x2_t [[TMP6]]
int32x4x2_t test_vld2q_s32(int32_t const *a) {
@@ -9328,7 +9328,7 @@ int32x4x2_t test_vld2q_s32(int32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x2_t [[TMP6]]
int64x2x2_t test_vld2q_s64(int64_t const *a) {
@@ -9346,7 +9346,7 @@ int64x2x2_t test_vld2q_s64(int64_t const *a) {
// CHECK: store { <8 x half>, <8 x half> } [[VLD2]], { <8 x half>, <8 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x2_t [[TMP6]]
float16x8x2_t test_vld2q_f16(float16_t const *a) {
@@ -9364,7 +9364,7 @@ float16x8x2_t test_vld2q_f16(float16_t const *a) {
// CHECK: store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x2_t [[TMP6]]
float32x4x2_t test_vld2q_f32(float32_t const *a) {
@@ -9382,7 +9382,7 @@ float32x4x2_t test_vld2q_f32(float32_t const *a) {
// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x2_t [[TMP6]]
float64x2x2_t test_vld2q_f64(float64_t const *a) {
@@ -9399,7 +9399,7 @@ float64x2x2_t test_vld2q_f64(float64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x2_t [[TMP5]]
poly8x16x2_t test_vld2q_p8(poly8_t const *a) {
@@ -9417,7 +9417,7 @@ poly8x16x2_t test_vld2q_p8(poly8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x2_t [[TMP6]]
poly16x8x2_t test_vld2q_p16(poly16_t const *a) {
@@ -9434,7 +9434,7 @@ poly16x8x2_t test_vld2q_p16(poly16_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x2_t [[TMP5]]
uint8x8x2_t test_vld2_u8(uint8_t const *a) {
@@ -9452,7 +9452,7 @@ uint8x8x2_t test_vld2_u8(uint8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x2_t [[TMP6]]
uint16x4x2_t test_vld2_u16(uint16_t const *a) {
@@ -9470,7 +9470,7 @@ uint16x4x2_t test_vld2_u16(uint16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x2_t [[TMP6]]
uint32x2x2_t test_vld2_u32(uint32_t const *a) {
@@ -9488,7 +9488,7 @@ uint32x2x2_t test_vld2_u32(uint32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x2_t [[TMP6]]
uint64x1x2_t test_vld2_u64(uint64_t const *a) {
@@ -9505,7 +9505,7 @@ uint64x1x2_t test_vld2_u64(uint64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x2_t [[TMP5]]
int8x8x2_t test_vld2_s8(int8_t const *a) {
@@ -9523,7 +9523,7 @@ int8x8x2_t test_vld2_s8(int8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x2_t [[TMP6]]
int16x4x2_t test_vld2_s16(int16_t const *a) {
@@ -9541,7 +9541,7 @@ int16x4x2_t test_vld2_s16(int16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x2_t [[TMP6]]
int32x2x2_t test_vld2_s32(int32_t const *a) {
@@ -9559,7 +9559,7 @@ int32x2x2_t test_vld2_s32(int32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x2_t [[TMP6]]
int64x1x2_t test_vld2_s64(int64_t const *a) {
@@ -9577,7 +9577,7 @@ int64x1x2_t test_vld2_s64(int64_t const *a) {
// CHECK: store { <4 x half>, <4 x half> } [[VLD2]], { <4 x half>, <4 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x2_t [[TMP6]]
float16x4x2_t test_vld2_f16(float16_t const *a) {
@@ -9595,7 +9595,7 @@ float16x4x2_t test_vld2_f16(float16_t const *a) {
// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x2_t [[TMP6]]
float32x2x2_t test_vld2_f32(float32_t const *a) {
@@ -9613,7 +9613,7 @@ float32x2x2_t test_vld2_f32(float32_t const *a) {
// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x2_t [[TMP6]]
float64x1x2_t test_vld2_f64(float64_t const *a) {
@@ -9630,7 +9630,7 @@ float64x1x2_t test_vld2_f64(float64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x2_t [[TMP5]]
poly8x8x2_t test_vld2_p8(poly8_t const *a) {
@@ -9648,7 +9648,7 @@ poly8x8x2_t test_vld2_p8(poly8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x2_t [[TMP6]]
poly16x4x2_t test_vld2_p16(poly16_t const *a) {
@@ -9665,7 +9665,7 @@ poly16x4x2_t test_vld2_p16(poly16_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 48, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x3_t [[TMP5]]
uint8x16x3_t test_vld3q_u8(uint8_t const *a) {
@@ -9683,7 +9683,7 @@ uint8x16x3_t test_vld3q_u8(uint8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x3_t [[TMP6]]
uint16x8x3_t test_vld3q_u16(uint16_t const *a) {
@@ -9701,7 +9701,7 @@ uint16x8x3_t test_vld3q_u16(uint16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x3_t [[TMP6]]
uint32x4x3_t test_vld3q_u32(uint32_t const *a) {
@@ -9719,7 +9719,7 @@ uint32x4x3_t test_vld3q_u32(uint32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x3_t [[TMP6]]
uint64x2x3_t test_vld3q_u64(uint64_t const *a) {
@@ -9736,7 +9736,7 @@ uint64x2x3_t test_vld3q_u64(uint64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 48, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x3_t [[TMP5]]
int8x16x3_t test_vld3q_s8(int8_t const *a) {
@@ -9754,7 +9754,7 @@ int8x16x3_t test_vld3q_s8(int8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x3_t [[TMP6]]
int16x8x3_t test_vld3q_s16(int16_t const *a) {
@@ -9772,7 +9772,7 @@ int16x8x3_t test_vld3q_s16(int16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x3_t [[TMP6]]
int32x4x3_t test_vld3q_s32(int32_t const *a) {
@@ -9790,7 +9790,7 @@ int32x4x3_t test_vld3q_s32(int32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x3_t [[TMP6]]
int64x2x3_t test_vld3q_s64(int64_t const *a) {
@@ -9808,7 +9808,7 @@ int64x2x3_t test_vld3q_s64(int64_t const *a) {
// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x3_t [[TMP6]]
float16x8x3_t test_vld3q_f16(float16_t const *a) {
@@ -9826,7 +9826,7 @@ float16x8x3_t test_vld3q_f16(float16_t const *a) {
// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x3_t [[TMP6]]
float32x4x3_t test_vld3q_f32(float32_t const *a) {
@@ -9844,7 +9844,7 @@ float32x4x3_t test_vld3q_f32(float32_t const *a) {
// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x3_t [[TMP6]]
float64x2x3_t test_vld3q_f64(float64_t const *a) {
@@ -9861,7 +9861,7 @@ float64x2x3_t test_vld3q_f64(float64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 48, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x3_t [[TMP5]]
poly8x16x3_t test_vld3q_p8(poly8_t const *a) {
@@ -9879,7 +9879,7 @@ poly8x16x3_t test_vld3q_p8(poly8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x3_t [[TMP6]]
poly16x8x3_t test_vld3q_p16(poly16_t const *a) {
@@ -9896,7 +9896,7 @@ poly16x8x3_t test_vld3q_p16(poly16_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 24, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x3_t [[TMP5]]
uint8x8x3_t test_vld3_u8(uint8_t const *a) {
@@ -9914,7 +9914,7 @@ uint8x8x3_t test_vld3_u8(uint8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x3_t [[TMP6]]
uint16x4x3_t test_vld3_u16(uint16_t const *a) {
@@ -9932,7 +9932,7 @@ uint16x4x3_t test_vld3_u16(uint16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x3_t [[TMP6]]
uint32x2x3_t test_vld3_u32(uint32_t const *a) {
@@ -9950,7 +9950,7 @@ uint32x2x3_t test_vld3_u32(uint32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x3_t [[TMP6]]
uint64x1x3_t test_vld3_u64(uint64_t const *a) {
@@ -9967,7 +9967,7 @@ uint64x1x3_t test_vld3_u64(uint64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 24, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x3_t [[TMP5]]
int8x8x3_t test_vld3_s8(int8_t const *a) {
@@ -9985,7 +9985,7 @@ int8x8x3_t test_vld3_s8(int8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x3_t [[TMP6]]
int16x4x3_t test_vld3_s16(int16_t const *a) {
@@ -10003,7 +10003,7 @@ int16x4x3_t test_vld3_s16(int16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x3_t [[TMP6]]
int32x2x3_t test_vld3_s32(int32_t const *a) {
@@ -10021,7 +10021,7 @@ int32x2x3_t test_vld3_s32(int32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x3_t [[TMP6]]
int64x1x3_t test_vld3_s64(int64_t const *a) {
@@ -10039,7 +10039,7 @@ int64x1x3_t test_vld3_s64(int64_t const *a) {
// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x3_t [[TMP6]]
float16x4x3_t test_vld3_f16(float16_t const *a) {
@@ -10057,7 +10057,7 @@ float16x4x3_t test_vld3_f16(float16_t const *a) {
// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x3_t [[TMP6]]
float32x2x3_t test_vld3_f32(float32_t const *a) {
@@ -10075,7 +10075,7 @@ float32x2x3_t test_vld3_f32(float32_t const *a) {
// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x3_t [[TMP6]]
float64x1x3_t test_vld3_f64(float64_t const *a) {
@@ -10092,7 +10092,7 @@ float64x1x3_t test_vld3_f64(float64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 24, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x3_t [[TMP5]]
poly8x8x3_t test_vld3_p8(poly8_t const *a) {
@@ -10110,7 +10110,7 @@ poly8x8x3_t test_vld3_p8(poly8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x3_t [[TMP6]]
poly16x4x3_t test_vld3_p16(poly16_t const *a) {
@@ -10127,7 +10127,7 @@ poly16x4x3_t test_vld3_p16(poly16_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 64, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x4_t [[TMP5]]
uint8x16x4_t test_vld4q_u8(uint8_t const *a) {
@@ -10145,7 +10145,7 @@ uint8x16x4_t test_vld4q_u8(uint8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x4_t [[TMP6]]
uint16x8x4_t test_vld4q_u16(uint16_t const *a) {
@@ -10163,7 +10163,7 @@ uint16x8x4_t test_vld4q_u16(uint16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x4_t [[TMP6]]
uint32x4x4_t test_vld4q_u32(uint32_t const *a) {
@@ -10181,7 +10181,7 @@ uint32x4x4_t test_vld4q_u32(uint32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x4_t [[TMP6]]
uint64x2x4_t test_vld4q_u64(uint64_t const *a) {
@@ -10198,7 +10198,7 @@ uint64x2x4_t test_vld4q_u64(uint64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 64, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x4_t [[TMP5]]
int8x16x4_t test_vld4q_s8(int8_t const *a) {
@@ -10216,7 +10216,7 @@ int8x16x4_t test_vld4q_s8(int8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x4_t [[TMP6]]
int16x8x4_t test_vld4q_s16(int16_t const *a) {
@@ -10234,7 +10234,7 @@ int16x8x4_t test_vld4q_s16(int16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x4_t [[TMP6]]
int32x4x4_t test_vld4q_s32(int32_t const *a) {
@@ -10252,7 +10252,7 @@ int32x4x4_t test_vld4q_s32(int32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x4_t [[TMP6]]
int64x2x4_t test_vld4q_s64(int64_t const *a) {
@@ -10270,7 +10270,7 @@ int64x2x4_t test_vld4q_s64(int64_t const *a) {
// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x4_t [[TMP6]]
float16x8x4_t test_vld4q_f16(float16_t const *a) {
@@ -10288,7 +10288,7 @@ float16x8x4_t test_vld4q_f16(float16_t const *a) {
// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x4_t [[TMP6]]
float32x4x4_t test_vld4q_f32(float32_t const *a) {
@@ -10306,7 +10306,7 @@ float32x4x4_t test_vld4q_f32(float32_t const *a) {
// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x4_t [[TMP6]]
float64x2x4_t test_vld4q_f64(float64_t const *a) {
@@ -10323,7 +10323,7 @@ float64x2x4_t test_vld4q_f64(float64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 64, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x4_t [[TMP5]]
poly8x16x4_t test_vld4q_p8(poly8_t const *a) {
@@ -10341,7 +10341,7 @@ poly8x16x4_t test_vld4q_p8(poly8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x4_t [[TMP6]]
poly16x8x4_t test_vld4q_p16(poly16_t const *a) {
@@ -10358,7 +10358,7 @@ poly16x8x4_t test_vld4q_p16(poly16_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 32, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x4_t [[TMP5]]
uint8x8x4_t test_vld4_u8(uint8_t const *a) {
@@ -10376,7 +10376,7 @@ uint8x8x4_t test_vld4_u8(uint8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x4_t [[TMP6]]
uint16x4x4_t test_vld4_u16(uint16_t const *a) {
@@ -10394,7 +10394,7 @@ uint16x4x4_t test_vld4_u16(uint16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x4_t [[TMP6]]
uint32x2x4_t test_vld4_u32(uint32_t const *a) {
@@ -10412,7 +10412,7 @@ uint32x2x4_t test_vld4_u32(uint32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x4_t [[TMP6]]
uint64x1x4_t test_vld4_u64(uint64_t const *a) {
@@ -10429,7 +10429,7 @@ uint64x1x4_t test_vld4_u64(uint64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 32, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x4_t [[TMP5]]
int8x8x4_t test_vld4_s8(int8_t const *a) {
@@ -10447,7 +10447,7 @@ int8x8x4_t test_vld4_s8(int8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x4_t [[TMP6]]
int16x4x4_t test_vld4_s16(int16_t const *a) {
@@ -10465,7 +10465,7 @@ int16x4x4_t test_vld4_s16(int16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x4_t [[TMP6]]
int32x2x4_t test_vld4_s32(int32_t const *a) {
@@ -10483,7 +10483,7 @@ int32x2x4_t test_vld4_s32(int32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x4_t [[TMP6]]
int64x1x4_t test_vld4_s64(int64_t const *a) {
@@ -10501,7 +10501,7 @@ int64x1x4_t test_vld4_s64(int64_t const *a) {
// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x4_t [[TMP6]]
float16x4x4_t test_vld4_f16(float16_t const *a) {
@@ -10519,7 +10519,7 @@ float16x4x4_t test_vld4_f16(float16_t const *a) {
// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x4_t [[TMP6]]
float32x2x4_t test_vld4_f32(float32_t const *a) {
@@ -10537,7 +10537,7 @@ float32x2x4_t test_vld4_f32(float32_t const *a) {
// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x4_t [[TMP6]]
float64x1x4_t test_vld4_f64(float64_t const *a) {
@@ -10554,7 +10554,7 @@ float64x1x4_t test_vld4_f64(float64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 32, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x4_t [[TMP5]]
poly8x8x4_t test_vld4_p8(poly8_t const *a) {
@@ -10572,7 +10572,7 @@ poly8x8x4_t test_vld4_p8(poly8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x4_t [[TMP6]]
poly16x4x4_t test_vld4_p16(poly16_t const *a) {
@@ -10854,7 +10854,7 @@ void test_vst1_p16(poly16_t *a, poly16x4_t b) {
// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -10874,7 +10874,7 @@ void test_vst2q_u8(uint8_t *a, uint8x16x2_t b) {
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -10899,7 +10899,7 @@ void test_vst2q_u16(uint16_t *a, uint16x8x2_t b) {
// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -10924,7 +10924,7 @@ void test_vst2q_u32(uint32_t *a, uint32x4x2_t b) {
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -10949,7 +10949,7 @@ void test_vst2q_u64(uint64_t *a, uint64x2x2_t b) {
// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -10969,7 +10969,7 @@ void test_vst2q_s8(int8_t *a, int8x16x2_t b) {
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -10994,7 +10994,7 @@ void test_vst2q_s16(int16_t *a, int16x8x2_t b) {
// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -11019,7 +11019,7 @@ void test_vst2q_s32(int32_t *a, int32x4x2_t b) {
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -11044,7 +11044,7 @@ void test_vst2q_s64(int64_t *a, int64x2x2_t b) {
// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
@@ -11069,7 +11069,7 @@ void test_vst2q_f16(float16_t *a, float16x8x2_t b) {
// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
@@ -11094,7 +11094,7 @@ void test_vst2q_f32(float32_t *a, float32x4x2_t b) {
// CHECK: store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -11119,7 +11119,7 @@ void test_vst2q_f64(float64_t *a, float64x2x2_t b) {
// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -11139,7 +11139,7 @@ void test_vst2q_p8(poly8_t *a, poly8x16x2_t b) {
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -11164,7 +11164,7 @@ void test_vst2q_p16(poly16_t *a, poly16x8x2_t b) {
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -11184,7 +11184,7 @@ void test_vst2_u8(uint8_t *a, uint8x8x2_t b) {
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -11209,7 +11209,7 @@ void test_vst2_u16(uint16_t *a, uint16x4x2_t b) {
// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -11234,7 +11234,7 @@ void test_vst2_u32(uint32_t *a, uint32x2x2_t b) {
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -11259,7 +11259,7 @@ void test_vst2_u64(uint64_t *a, uint64x1x2_t b) {
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -11279,7 +11279,7 @@ void test_vst2_s8(int8_t *a, int8x8x2_t b) {
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -11304,7 +11304,7 @@ void test_vst2_s16(int16_t *a, int16x4x2_t b) {
// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -11329,7 +11329,7 @@ void test_vst2_s32(int32_t *a, int32x2x2_t b) {
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -11354,7 +11354,7 @@ void test_vst2_s64(int64_t *a, int64x1x2_t b) {
// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
@@ -11379,7 +11379,7 @@ void test_vst2_f16(float16_t *a, float16x4x2_t b) {
// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
@@ -11404,7 +11404,7 @@ void test_vst2_f32(float32_t *a, float32x2x2_t b) {
// CHECK: store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -11429,7 +11429,7 @@ void test_vst2_f64(float64_t *a, float64x1x2_t b) {
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -11449,7 +11449,7 @@ void test_vst2_p8(poly8_t *a, poly8x8x2_t b) {
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -11474,7 +11474,7 @@ void test_vst2_p16(poly16_t *a, poly16x4x2_t b) {
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -11497,7 +11497,7 @@ void test_vst3q_u8(uint8_t *a, uint8x16x3_t b) {
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -11527,7 +11527,7 @@ void test_vst3q_u16(uint16_t *a, uint16x8x3_t b) {
// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -11557,7 +11557,7 @@ void test_vst3q_u32(uint32_t *a, uint32x4x3_t b) {
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -11587,7 +11587,7 @@ void test_vst3q_u64(uint64_t *a, uint64x2x3_t b) {
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -11610,7 +11610,7 @@ void test_vst3q_s8(int8_t *a, int8x16x3_t b) {
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -11640,7 +11640,7 @@ void test_vst3q_s16(int16_t *a, int16x8x3_t b) {
// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -11670,7 +11670,7 @@ void test_vst3q_s32(int32_t *a, int32x4x3_t b) {
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -11700,7 +11700,7 @@ void test_vst3q_s64(int64_t *a, int64x2x3_t b) {
// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
@@ -11730,7 +11730,7 @@ void test_vst3q_f16(float16_t *a, float16x8x3_t b) {
// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
@@ -11760,7 +11760,7 @@ void test_vst3q_f32(float32_t *a, float32x4x3_t b) {
// CHECK: store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -11790,7 +11790,7 @@ void test_vst3q_f64(float64_t *a, float64x2x3_t b) {
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -11813,7 +11813,7 @@ void test_vst3q_p8(poly8_t *a, poly8x16x3_t b) {
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -11843,7 +11843,7 @@ void test_vst3q_p16(poly16_t *a, poly16x8x3_t b) {
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -11866,7 +11866,7 @@ void test_vst3_u8(uint8_t *a, uint8x8x3_t b) {
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -11896,7 +11896,7 @@ void test_vst3_u16(uint16_t *a, uint16x4x3_t b) {
// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -11926,7 +11926,7 @@ void test_vst3_u32(uint32_t *a, uint32x2x3_t b) {
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -11956,7 +11956,7 @@ void test_vst3_u64(uint64_t *a, uint64x1x3_t b) {
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -11979,7 +11979,7 @@ void test_vst3_s8(int8_t *a, int8x8x3_t b) {
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -12009,7 +12009,7 @@ void test_vst3_s16(int16_t *a, int16x4x3_t b) {
// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -12039,7 +12039,7 @@ void test_vst3_s32(int32_t *a, int32x2x3_t b) {
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -12069,7 +12069,7 @@ void test_vst3_s64(int64_t *a, int64x1x3_t b) {
// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
@@ -12099,7 +12099,7 @@ void test_vst3_f16(float16_t *a, float16x4x3_t b) {
// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
@@ -12129,7 +12129,7 @@ void test_vst3_f32(float32_t *a, float32x2x3_t b) {
// CHECK: store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -12159,7 +12159,7 @@ void test_vst3_f64(float64_t *a, float64x1x3_t b) {
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -12182,7 +12182,7 @@ void test_vst3_p8(poly8_t *a, poly8x8x3_t b) {
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -12212,7 +12212,7 @@ void test_vst3_p16(poly16_t *a, poly16x4x3_t b) {
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -12238,7 +12238,7 @@ void test_vst4q_u8(uint8_t *a, uint8x16x4_t b) {
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -12273,7 +12273,7 @@ void test_vst4q_u16(uint16_t *a, uint16x8x4_t b) {
// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -12308,7 +12308,7 @@ void test_vst4q_u32(uint32_t *a, uint32x4x4_t b) {
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -12343,7 +12343,7 @@ void test_vst4q_u64(uint64_t *a, uint64x2x4_t b) {
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -12369,7 +12369,7 @@ void test_vst4q_s8(int8_t *a, int8x16x4_t b) {
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -12404,7 +12404,7 @@ void test_vst4q_s16(int16_t *a, int16x8x4_t b) {
// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -12439,7 +12439,7 @@ void test_vst4q_s32(int32_t *a, int32x4x4_t b) {
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -12474,7 +12474,7 @@ void test_vst4q_s64(int64_t *a, int64x2x4_t b) {
// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
@@ -12509,7 +12509,7 @@ void test_vst4q_f16(float16_t *a, float16x8x4_t b) {
// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
@@ -12544,7 +12544,7 @@ void test_vst4q_f32(float32_t *a, float32x4x4_t b) {
// CHECK: store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -12579,7 +12579,7 @@ void test_vst4q_f64(float64_t *a, float64x2x4_t b) {
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -12605,7 +12605,7 @@ void test_vst4q_p8(poly8_t *a, poly8x16x4_t b) {
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -12640,7 +12640,7 @@ void test_vst4q_p16(poly16_t *a, poly16x8x4_t b) {
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -12666,7 +12666,7 @@ void test_vst4_u8(uint8_t *a, uint8x8x4_t b) {
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -12701,7 +12701,7 @@ void test_vst4_u16(uint16_t *a, uint16x4x4_t b) {
// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -12736,7 +12736,7 @@ void test_vst4_u32(uint32_t *a, uint32x2x4_t b) {
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -12771,7 +12771,7 @@ void test_vst4_u64(uint64_t *a, uint64x1x4_t b) {
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -12797,7 +12797,7 @@ void test_vst4_s8(int8_t *a, int8x8x4_t b) {
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -12832,7 +12832,7 @@ void test_vst4_s16(int16_t *a, int16x4x4_t b) {
// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -12867,7 +12867,7 @@ void test_vst4_s32(int32_t *a, int32x2x4_t b) {
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -12902,7 +12902,7 @@ void test_vst4_s64(int64_t *a, int64x1x4_t b) {
// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
@@ -12937,7 +12937,7 @@ void test_vst4_f16(float16_t *a, float16x4x4_t b) {
// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
@@ -12972,7 +12972,7 @@ void test_vst4_f32(float32_t *a, float32x2x4_t b) {
// CHECK: store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -13007,7 +13007,7 @@ void test_vst4_f64(float64_t *a, float64x1x4_t b) {
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -13033,7 +13033,7 @@ void test_vst4_p8(poly8_t *a, poly8x8x4_t b) {
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -13061,182 +13061,6 @@ void test_vst4_p16(poly16_t *a, poly16x4x4_t b) {
vst4_p16(a, b);
}
-// CHECK-LABEL: @test_vld1q_u8_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x2_t [[TMP4]]
-uint8x16x2_t test_vld1q_u8_x2(uint8_t const *a) {
- return vld1q_u8_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1q_u16_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP6]]
-uint16x8x2_t test_vld1q_u16_x2(uint16_t const *a) {
- return vld1q_u16_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1q_u32_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP6]]
-uint32x4x2_t test_vld1q_u32_x2(uint32_t const *a) {
- return vld1q_u32_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1q_u64_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
-uint64x2x2_t test_vld1q_u64_x2(uint64_t const *a) {
- return vld1q_u64_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1q_s8_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x2_t [[TMP4]]
-int8x16x2_t test_vld1q_s8_x2(int8_t const *a) {
- return vld1q_s8_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1q_s16_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x2_t [[TMP6]]
-int16x8x2_t test_vld1q_s16_x2(int16_t const *a) {
- return vld1q_s16_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1q_s32_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x2_t [[TMP6]]
-int32x4x2_t test_vld1q_s32_x2(int32_t const *a) {
- return vld1q_s32_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1q_s64_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x2_t [[TMP6]]
-int64x2x2_t test_vld1q_s64_x2(int64_t const *a) {
- return vld1q_s64_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1q_f16_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0f16(half* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x half>, <8 x half> }*
-// CHECK: store { <8 x half>, <8 x half> } [[VLD1XN]], { <8 x half>, <8 x half> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x2_t [[TMP6]]
-float16x8x2_t test_vld1q_f16_x2(float16_t const *a) {
- return vld1q_f16_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1q_f32_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
-// CHECK: store { <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP6]]
-float32x4x2_t test_vld1q_f32_x2(float32_t const *a) {
- return vld1q_f32_x2(a);
-}
-
// CHECK-LABEL: @test_vld1q_f64_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
@@ -13248,47 +13072,13 @@ float32x4x2_t test_vld1q_f32_x2(float32_t const *a) {
// CHECK: store { <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x2_t [[TMP6]]
float64x2x2_t test_vld1q_f64_x2(float64_t const *a) {
return vld1q_f64_x2(a);
}
-// CHECK-LABEL: @test_vld1q_p8_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x2_t [[TMP4]]
-poly8x16x2_t test_vld1q_p8_x2(poly8_t const *a) {
- return vld1q_p8_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1q_p16_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP6]]
-poly16x8x2_t test_vld1q_p16_x2(poly16_t const *a) {
- return vld1q_p16_x2(a);
-}
-
// CHECK-LABEL: @test_vld1q_p64_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
@@ -13300,189 +13090,13 @@ poly16x8x2_t test_vld1q_p16_x2(poly16_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
poly64x2x2_t test_vld1q_p64_x2(poly64_t const *a) {
return vld1q_p64_x2(a);
}
-// CHECK-LABEL: @test_vld1_u8_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x2_t [[TMP4]]
-uint8x8x2_t test_vld1_u8_x2(uint8_t const *a) {
- return vld1_u8_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1_u16_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x2_t [[TMP6]]
-uint16x4x2_t test_vld1_u16_x2(uint16_t const *a) {
- return vld1_u16_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1_u32_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x2_t [[TMP6]]
-uint32x2x2_t test_vld1_u32_x2(uint32_t const *a) {
- return vld1_u32_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1_u64_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x2_t [[TMP6]]
-uint64x1x2_t test_vld1_u64_x2(uint64_t const *a) {
- return vld1_u64_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1_s8_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x2_t [[TMP4]]
-int8x8x2_t test_vld1_s8_x2(int8_t const *a) {
- return vld1_s8_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1_s16_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x2_t [[TMP6]]
-int16x4x2_t test_vld1_s16_x2(int16_t const *a) {
- return vld1_s16_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1_s32_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x2_t [[TMP6]]
-int32x2x2_t test_vld1_s32_x2(int32_t const *a) {
- return vld1_s32_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1_s64_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x2_t [[TMP6]]
-int64x1x2_t test_vld1_s64_x2(int64_t const *a) {
- return vld1_s64_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1_f16_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0f16(half* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x half>, <4 x half> }*
-// CHECK: store { <4 x half>, <4 x half> } [[VLD1XN]], { <4 x half>, <4 x half> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x2_t [[TMP6]]
-float16x4x2_t test_vld1_f16_x2(float16_t const *a) {
- return vld1_f16_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1_f32_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
-// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
-// CHECK: store { <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x2_t [[TMP6]]
-float32x2x2_t test_vld1_f32_x2(float32_t const *a) {
- return vld1_f32_x2(a);
-}
-
// CHECK-LABEL: @test_vld1_f64_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
@@ -13494,47 +13108,13 @@ float32x2x2_t test_vld1_f32_x2(float32_t const *a) {
// CHECK: store { <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x2_t [[TMP6]]
float64x1x2_t test_vld1_f64_x2(float64_t const *a) {
return vld1_f64_x2(a);
}
-// CHECK-LABEL: @test_vld1_p8_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x2_t [[TMP4]]
-poly8x8x2_t test_vld1_p8_x2(poly8_t const *a) {
- return vld1_p8_x2(a);
-}
-
-// CHECK-LABEL: @test_vld1_p16_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x2_t [[TMP6]]
-poly16x4x2_t test_vld1_p16_x2(poly16_t const *a) {
- return vld1_p16_x2(a);
-}
-
// CHECK-LABEL: @test_vld1_p64_x2(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
@@ -13546,189 +13126,13 @@ poly16x4x2_t test_vld1_p16_x2(poly16_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
poly64x1x2_t test_vld1_p64_x2(poly64_t const *a) {
return vld1_p64_x2(a);
}
-// CHECK-LABEL: @test_vld1q_u8_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x3_t [[TMP4]]
-uint8x16x3_t test_vld1q_u8_x3(uint8_t const *a) {
- return vld1q_u8_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1q_u16_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x3_t [[TMP6]]
-uint16x8x3_t test_vld1q_u16_x3(uint16_t const *a) {
- return vld1q_u16_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1q_u32_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x3_t [[TMP6]]
-uint32x4x3_t test_vld1q_u32_x3(uint32_t const *a) {
- return vld1q_u32_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1q_u64_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x3_t [[TMP6]]
-uint64x2x3_t test_vld1q_u64_x3(uint64_t const *a) {
- return vld1q_u64_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1q_s8_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x3_t [[TMP4]]
-int8x16x3_t test_vld1q_s8_x3(int8_t const *a) {
- return vld1q_s8_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1q_s16_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x3_t [[TMP6]]
-int16x8x3_t test_vld1q_s16_x3(int16_t const *a) {
- return vld1q_s16_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1q_s32_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x3_t [[TMP6]]
-int32x4x3_t test_vld1q_s32_x3(int32_t const *a) {
- return vld1q_s32_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1q_s64_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x3_t [[TMP6]]
-int64x2x3_t test_vld1q_s64_x3(int64_t const *a) {
- return vld1q_s64_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1q_f16_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0f16(half* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x half>, <8 x half>, <8 x half> }*
-// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD1XN]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x3_t [[TMP6]]
-float16x8x3_t test_vld1q_f16_x3(float16_t const *a) {
- return vld1q_f16_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1q_f32_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
-// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x3_t [[TMP6]]
-float32x4x3_t test_vld1q_f32_x3(float32_t const *a) {
- return vld1q_f32_x3(a);
-}
-
// CHECK-LABEL: @test_vld1q_f64_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
@@ -13740,47 +13144,13 @@ float32x4x3_t test_vld1q_f32_x3(float32_t const *a) {
// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x3_t [[TMP6]]
float64x2x3_t test_vld1q_f64_x3(float64_t const *a) {
return vld1q_f64_x3(a);
}
-// CHECK-LABEL: @test_vld1q_p8_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x3_t [[TMP4]]
-poly8x16x3_t test_vld1q_p8_x3(poly8_t const *a) {
- return vld1q_p8_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1q_p16_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x3_t [[TMP6]]
-poly16x8x3_t test_vld1q_p16_x3(poly16_t const *a) {
- return vld1q_p16_x3(a);
-}
-
// CHECK-LABEL: @test_vld1q_p64_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
@@ -13792,189 +13162,13 @@ poly16x8x3_t test_vld1q_p16_x3(poly16_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x3_t [[TMP6]]
poly64x2x3_t test_vld1q_p64_x3(poly64_t const *a) {
return vld1q_p64_x3(a);
}
-// CHECK-LABEL: @test_vld1_u8_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x3_t [[TMP4]]
-uint8x8x3_t test_vld1_u8_x3(uint8_t const *a) {
- return vld1_u8_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1_u16_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x3_t [[TMP6]]
-uint16x4x3_t test_vld1_u16_x3(uint16_t const *a) {
- return vld1_u16_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1_u32_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x3_t [[TMP6]]
-uint32x2x3_t test_vld1_u32_x3(uint32_t const *a) {
- return vld1_u32_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1_u64_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x3_t [[TMP6]]
-uint64x1x3_t test_vld1_u64_x3(uint64_t const *a) {
- return vld1_u64_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1_s8_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x3_t [[TMP4]]
-int8x8x3_t test_vld1_s8_x3(int8_t const *a) {
- return vld1_s8_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1_s16_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x3_t [[TMP6]]
-int16x4x3_t test_vld1_s16_x3(int16_t const *a) {
- return vld1_s16_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1_s32_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x3_t [[TMP6]]
-int32x2x3_t test_vld1_s32_x3(int32_t const *a) {
- return vld1_s32_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1_s64_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x3_t [[TMP6]]
-int64x1x3_t test_vld1_s64_x3(int64_t const *a) {
- return vld1_s64_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1_f16_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0f16(half* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x half>, <4 x half>, <4 x half> }*
-// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD1XN]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x3_t [[TMP6]]
-float16x4x3_t test_vld1_f16_x3(float16_t const *a) {
- return vld1_f16_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1_f32_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
-// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
-// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x3_t [[TMP6]]
-float32x2x3_t test_vld1_f32_x3(float32_t const *a) {
- return vld1_f32_x3(a);
-}
-
// CHECK-LABEL: @test_vld1_f64_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
@@ -13986,47 +13180,13 @@ float32x2x3_t test_vld1_f32_x3(float32_t const *a) {
// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x3_t [[TMP6]]
float64x1x3_t test_vld1_f64_x3(float64_t const *a) {
return vld1_f64_x3(a);
}
-// CHECK-LABEL: @test_vld1_p8_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x3_t [[TMP4]]
-poly8x8x3_t test_vld1_p8_x3(poly8_t const *a) {
- return vld1_p8_x3(a);
-}
-
-// CHECK-LABEL: @test_vld1_p16_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x3_t [[TMP6]]
-poly16x4x3_t test_vld1_p16_x3(poly16_t const *a) {
- return vld1_p16_x3(a);
-}
-
// CHECK-LABEL: @test_vld1_p64_x3(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
@@ -14038,189 +13198,13 @@ poly16x4x3_t test_vld1_p16_x3(poly16_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x3_t [[TMP6]]
poly64x1x3_t test_vld1_p64_x3(poly64_t const *a) {
return vld1_p64_x3(a);
}
-// CHECK-LABEL: @test_vld1q_u8_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x4_t [[TMP4]]
-uint8x16x4_t test_vld1q_u8_x4(uint8_t const *a) {
- return vld1q_u8_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1q_u16_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x4_t [[TMP6]]
-uint16x8x4_t test_vld1q_u16_x4(uint16_t const *a) {
- return vld1q_u16_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1q_u32_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x4_t [[TMP6]]
-uint32x4x4_t test_vld1q_u32_x4(uint32_t const *a) {
- return vld1q_u32_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1q_u64_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x4_t [[TMP6]]
-uint64x2x4_t test_vld1q_u64_x4(uint64_t const *a) {
- return vld1q_u64_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1q_s8_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x4_t [[TMP4]]
-int8x16x4_t test_vld1q_s8_x4(int8_t const *a) {
- return vld1q_s8_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1q_s16_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x4_t [[TMP6]]
-int16x8x4_t test_vld1q_s16_x4(int16_t const *a) {
- return vld1q_s16_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1q_s32_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x4_t [[TMP6]]
-int32x4x4_t test_vld1q_s32_x4(int32_t const *a) {
- return vld1q_s32_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1q_s64_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x4_t [[TMP6]]
-int64x2x4_t test_vld1q_s64_x4(int64_t const *a) {
- return vld1q_s64_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1q_f16_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0f16(half* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x half>, <8 x half>, <8 x half>, <8 x half> }*
-// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD1XN]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x4_t [[TMP6]]
-float16x8x4_t test_vld1q_f16_x4(float16_t const *a) {
- return vld1q_f16_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1q_f32_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
-// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x4_t [[TMP6]]
-float32x4x4_t test_vld1q_f32_x4(float32_t const *a) {
- return vld1q_f32_x4(a);
-}
-
// CHECK-LABEL: @test_vld1q_f64_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
@@ -14232,47 +13216,13 @@ float32x4x4_t test_vld1q_f32_x4(float32_t const *a) {
// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x4_t [[TMP6]]
float64x2x4_t test_vld1q_f64_x4(float64_t const *a) {
return vld1q_f64_x4(a);
}
-// CHECK-LABEL: @test_vld1q_p8_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x4_t [[TMP4]]
-poly8x16x4_t test_vld1q_p8_x4(poly8_t const *a) {
- return vld1q_p8_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1q_p16_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x4_t [[TMP6]]
-poly16x8x4_t test_vld1q_p16_x4(poly16_t const *a) {
- return vld1q_p16_x4(a);
-}
-
// CHECK-LABEL: @test_vld1q_p64_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
@@ -14284,189 +13234,13 @@ poly16x8x4_t test_vld1q_p16_x4(poly16_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x4_t [[TMP6]]
poly64x2x4_t test_vld1q_p64_x4(poly64_t const *a) {
return vld1q_p64_x4(a);
}
-// CHECK-LABEL: @test_vld1_u8_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x4_t [[TMP4]]
-uint8x8x4_t test_vld1_u8_x4(uint8_t const *a) {
- return vld1_u8_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1_u16_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x4_t [[TMP6]]
-uint16x4x4_t test_vld1_u16_x4(uint16_t const *a) {
- return vld1_u16_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1_u32_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x4_t [[TMP6]]
-uint32x2x4_t test_vld1_u32_x4(uint32_t const *a) {
- return vld1_u32_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1_u64_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x4_t [[TMP6]]
-uint64x1x4_t test_vld1_u64_x4(uint64_t const *a) {
- return vld1_u64_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1_s8_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x4_t [[TMP4]]
-int8x8x4_t test_vld1_s8_x4(int8_t const *a) {
- return vld1_s8_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1_s16_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x4_t [[TMP6]]
-int16x4x4_t test_vld1_s16_x4(int16_t const *a) {
- return vld1_s16_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1_s32_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x4_t [[TMP6]]
-int32x2x4_t test_vld1_s32_x4(int32_t const *a) {
- return vld1_s32_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1_s64_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x4_t [[TMP6]]
-int64x1x4_t test_vld1_s64_x4(int64_t const *a) {
- return vld1_s64_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1_f16_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0f16(half* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x half>, <4 x half>, <4 x half>, <4 x half> }*
-// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD1XN]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x4_t [[TMP6]]
-float16x4x4_t test_vld1_f16_x4(float16_t const *a) {
- return vld1_f16_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1_f32_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
-// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
-// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x4_t [[TMP6]]
-float32x2x4_t test_vld1_f32_x4(float32_t const *a) {
- return vld1_f32_x4(a);
-}
-
// CHECK-LABEL: @test_vld1_f64_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
@@ -14478,47 +13252,13 @@ float32x2x4_t test_vld1_f32_x4(float32_t const *a) {
// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x4_t [[TMP6]]
float64x1x4_t test_vld1_f64_x4(float64_t const *a) {
return vld1_f64_x4(a);
}
-// CHECK-LABEL: @test_vld1_p8_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x4_t [[TMP4]]
-poly8x8x4_t test_vld1_p8_x4(poly8_t const *a) {
- return vld1_p8_x4(a);
-}
-
-// CHECK-LABEL: @test_vld1_p16_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x4_t [[TMP6]]
-poly16x4x4_t test_vld1_p16_x4(poly16_t const *a) {
- return vld1_p16_x4(a);
-}
-
// CHECK-LABEL: @test_vld1_p64_x4(
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
@@ -14530,261 +13270,13 @@ poly16x4x4_t test_vld1_p16_x4(poly16_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x4_t [[TMP6]]
poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) {
return vld1_p64_x4(a);
}
-// CHECK-LABEL: @test_vst1q_u8_x2(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
-// CHECK: ret void
-void test_vst1q_u8_x2(uint8_t *a, uint8x16x2_t b) {
- vst1q_u8_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_u16_x2(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
-// CHECK: ret void
-void test_vst1q_u16_x2(uint16_t *a, uint16x8x2_t b) {
- vst1q_u16_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_u32_x2(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]])
-// CHECK: ret void
-void test_vst1q_u32_x2(uint32_t *a, uint32x4x2_t b) {
- vst1q_u32_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_u64_x2(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
-// CHECK: ret void
-void test_vst1q_u64_x2(uint64_t *a, uint64x2x2_t b) {
- vst1q_u64_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_s8_x2(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
-// CHECK: ret void
-void test_vst1q_s8_x2(int8_t *a, int8x16x2_t b) {
- vst1q_s8_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_s16_x2(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
-// CHECK: ret void
-void test_vst1q_s16_x2(int16_t *a, int16x8x2_t b) {
- vst1q_s16_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_s32_x2(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]])
-// CHECK: ret void
-void test_vst1q_s32_x2(int32_t *a, int32x4x2_t b) {
- vst1q_s32_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_s64_x2(
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
-// CHECK: ret void
-void test_vst1q_s64_x2(int64_t *a, int64x2x2_t b) {
- vst1q_s64_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_f16_x2(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to half*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v8f16.p0f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], half* [[TMP9]])
-// CHECK: ret void
-void test_vst1q_f16_x2(float16_t *a, float16x8x2_t b) {
- vst1q_f16_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_f32_x2(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], float* [[TMP9]])
-// CHECK: ret void
-void test_vst1q_f32_x2(float32_t *a, float32x4x2_t b) {
- vst1q_f32_x2(a, b);
-}
-
// CHECK-LABEL: @test_vst1q_f64_x2(
// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
@@ -14792,7 +13284,7 @@ void test_vst1q_f32_x2(float32_t *a, float32x4x2_t b) {
// CHECK: store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -14811,52 +13303,6 @@ void test_vst1q_f64_x2(float64_t *a, float64x2x2_t b) {
vst1q_f64_x2(a, b);
}
-// CHECK-LABEL: @test_vst1q_p8_x2(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
-// CHECK: ret void
-void test_vst1q_p8_x2(poly8_t *a, poly8x16x2_t b) {
- vst1q_p8_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_p16_x2(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
-// CHECK: ret void
-void test_vst1q_p16_x2(poly16_t *a, poly16x8x2_t b) {
- vst1q_p16_x2(a, b);
-}
-
// CHECK-LABEL: @test_vst1q_p64_x2(
// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
@@ -14864,7 +13310,7 @@ void test_vst1q_p16_x2(poly16_t *a, poly16x8x2_t b) {
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -14883,254 +13329,6 @@ void test_vst1q_p64_x2(poly64_t *a, poly64x2x2_t b) {
vst1q_p64_x2(a, b);
}
-// CHECK-LABEL: @test_vst1_u8_x2(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
-// CHECK: ret void
-void test_vst1_u8_x2(uint8_t *a, uint8x8x2_t b) {
- vst1_u8_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_u16_x2(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
-// CHECK: ret void
-void test_vst1_u16_x2(uint16_t *a, uint16x4x2_t b) {
- vst1_u16_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_u32_x2(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]])
-// CHECK: ret void
-void test_vst1_u32_x2(uint32_t *a, uint32x2x2_t b) {
- vst1_u32_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_u64_x2(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
-// CHECK: ret void
-void test_vst1_u64_x2(uint64_t *a, uint64x1x2_t b) {
- vst1_u64_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_s8_x2(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
-// CHECK: ret void
-void test_vst1_s8_x2(int8_t *a, int8x8x2_t b) {
- vst1_s8_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_s16_x2(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
-// CHECK: ret void
-void test_vst1_s16_x2(int16_t *a, int16x4x2_t b) {
- vst1_s16_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_s32_x2(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]])
-// CHECK: ret void
-void test_vst1_s32_x2(int32_t *a, int32x2x2_t b) {
- vst1_s32_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_s64_x2(
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
-// CHECK: ret void
-void test_vst1_s64_x2(int64_t *a, int64x1x2_t b) {
- vst1_s64_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_f16_x2(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to half*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v4f16.p0f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], half* [[TMP9]])
-// CHECK: ret void
-void test_vst1_f16_x2(float16_t *a, float16x4x2_t b) {
- vst1_f16_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_f32_x2(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], float* [[TMP9]])
-// CHECK: ret void
-void test_vst1_f32_x2(float32_t *a, float32x2x2_t b) {
- vst1_f32_x2(a, b);
-}
-
// CHECK-LABEL: @test_vst1_f64_x2(
// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
@@ -15138,7 +13336,7 @@ void test_vst1_f32_x2(float32_t *a, float32x2x2_t b) {
// CHECK: store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -15157,52 +13355,6 @@ void test_vst1_f64_x2(float64_t *a, float64x1x2_t b) {
vst1_f64_x2(a, b);
}
-// CHECK-LABEL: @test_vst1_p8_x2(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
-// CHECK: ret void
-void test_vst1_p8_x2(poly8_t *a, poly8x8x2_t b) {
- vst1_p8_x2(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_p16_x2(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
-// CHECK: ret void
-void test_vst1_p16_x2(poly16_t *a, poly16x4x2_t b) {
- vst1_p16_x2(a, b);
-}
-
// CHECK-LABEL: @test_vst1_p64_x2(
// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
@@ -15210,7 +13362,7 @@ void test_vst1_p16_x2(poly16_t *a, poly16x4x2_t b) {
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -15229,300 +13381,6 @@ void test_vst1_p64_x2(poly64_t *a, poly64x1x2_t b) {
vst1_p64_x2(a, b);
}
-// CHECK-LABEL: @test_vst1q_u8_x3(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
-// CHECK: ret void
-void test_vst1q_u8_x3(uint8_t *a, uint8x16x3_t b) {
- vst1q_u8_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_u16_x3(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
-// CHECK: ret void
-void test_vst1q_u16_x3(uint16_t *a, uint16x8x3_t b) {
- vst1q_u16_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_u32_x3(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32* [[TMP12]])
-// CHECK: ret void
-void test_vst1q_u32_x3(uint32_t *a, uint32x4x3_t b) {
- vst1q_u32_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_u64_x3(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
-// CHECK: ret void
-void test_vst1q_u64_x3(uint64_t *a, uint64x2x3_t b) {
- vst1q_u64_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_s8_x3(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
-// CHECK: ret void
-void test_vst1q_s8_x3(int8_t *a, int8x16x3_t b) {
- vst1q_s8_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_s16_x3(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
-// CHECK: ret void
-void test_vst1q_s16_x3(int16_t *a, int16x8x3_t b) {
- vst1q_s16_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_s32_x3(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32* [[TMP12]])
-// CHECK: ret void
-void test_vst1q_s32_x3(int32_t *a, int32x4x3_t b) {
- vst1q_s32_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_s64_x3(
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
-// CHECK: ret void
-void test_vst1q_s64_x3(int64_t *a, int64x2x3_t b) {
- vst1q_s64_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_f16_x3(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to half*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v8f16.p0f16(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], half* [[TMP12]])
-// CHECK: ret void
-void test_vst1q_f16_x3(float16_t *a, float16x8x3_t b) {
- vst1q_f16_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_f32_x3(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], float* [[TMP12]])
-// CHECK: ret void
-void test_vst1q_f32_x3(float32_t *a, float32x4x3_t b) {
- vst1q_f32_x3(a, b);
-}
-
// CHECK-LABEL: @test_vst1q_f64_x3(
// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
@@ -15530,7 +13388,7 @@ void test_vst1q_f32_x3(float32_t *a, float32x4x3_t b) {
// CHECK: store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -15554,60 +13412,6 @@ void test_vst1q_f64_x3(float64_t *a, float64x2x3_t b) {
vst1q_f64_x3(a, b);
}
-// CHECK-LABEL: @test_vst1q_p8_x3(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
-// CHECK: ret void
-void test_vst1q_p8_x3(poly8_t *a, poly8x16x3_t b) {
- vst1q_p8_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_p16_x3(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
-// CHECK: ret void
-void test_vst1q_p16_x3(poly16_t *a, poly16x8x3_t b) {
- vst1q_p16_x3(a, b);
-}
-
// CHECK-LABEL: @test_vst1q_p64_x3(
// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
@@ -15615,7 +13419,7 @@ void test_vst1q_p16_x3(poly16_t *a, poly16x8x3_t b) {
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -15639,300 +13443,6 @@ void test_vst1q_p64_x3(poly64_t *a, poly64x2x3_t b) {
vst1q_p64_x3(a, b);
}
-// CHECK-LABEL: @test_vst1_u8_x3(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
-// CHECK: ret void
-void test_vst1_u8_x3(uint8_t *a, uint8x8x3_t b) {
- vst1_u8_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_u16_x3(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
-// CHECK: ret void
-void test_vst1_u16_x3(uint16_t *a, uint16x4x3_t b) {
- vst1_u16_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_u32_x3(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
-// CHECK: ret void
-void test_vst1_u32_x3(uint32_t *a, uint32x2x3_t b) {
- vst1_u32_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_u64_x3(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
-// CHECK: ret void
-void test_vst1_u64_x3(uint64_t *a, uint64x1x3_t b) {
- vst1_u64_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_s8_x3(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
-// CHECK: ret void
-void test_vst1_s8_x3(int8_t *a, int8x8x3_t b) {
- vst1_s8_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_s16_x3(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
-// CHECK: ret void
-void test_vst1_s16_x3(int16_t *a, int16x4x3_t b) {
- vst1_s16_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_s32_x3(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
-// CHECK: ret void
-void test_vst1_s32_x3(int32_t *a, int32x2x3_t b) {
- vst1_s32_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_s64_x3(
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
-// CHECK: ret void
-void test_vst1_s64_x3(int64_t *a, int64x1x3_t b) {
- vst1_s64_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_f16_x3(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to half*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v4f16.p0f16(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], half* [[TMP12]])
-// CHECK: ret void
-void test_vst1_f16_x3(float16_t *a, float16x4x3_t b) {
- vst1_f16_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_f32_x3(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], float* [[TMP12]])
-// CHECK: ret void
-void test_vst1_f32_x3(float32_t *a, float32x2x3_t b) {
- vst1_f32_x3(a, b);
-}
-
// CHECK-LABEL: @test_vst1_f64_x3(
// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
@@ -15940,7 +13450,7 @@ void test_vst1_f32_x3(float32_t *a, float32x2x3_t b) {
// CHECK: store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -15964,60 +13474,6 @@ void test_vst1_f64_x3(float64_t *a, float64x1x3_t b) {
vst1_f64_x3(a, b);
}
-// CHECK-LABEL: @test_vst1_p8_x3(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
-// CHECK: ret void
-void test_vst1_p8_x3(poly8_t *a, poly8x8x3_t b) {
- vst1_p8_x3(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_p16_x3(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
-// CHECK: ret void
-void test_vst1_p16_x3(poly16_t *a, poly16x4x3_t b) {
- vst1_p16_x3(a, b);
-}
-
// CHECK-LABEL: @test_vst1_p64_x3(
// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
@@ -16025,7 +13481,7 @@ void test_vst1_p16_x3(poly16_t *a, poly16x4x3_t b) {
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -16049,346 +13505,6 @@ void test_vst1_p64_x3(poly64_t *a, poly64x1x3_t b) {
vst1_p64_x3(a, b);
}
-// CHECK-LABEL: @test_vst1q_u8_x4(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
-// CHECK: ret void
-void test_vst1q_u8_x4(uint8_t *a, uint8x16x4_t b) {
- vst1q_u8_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_u16_x4(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
-// CHECK: ret void
-void test_vst1q_u16_x4(uint16_t *a, uint16x8x4_t b) {
- vst1q_u16_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_u32_x4(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i32* [[TMP15]])
-// CHECK: ret void
-void test_vst1q_u32_x4(uint32_t *a, uint32x4x4_t b) {
- vst1q_u32_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_u64_x4(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
-// CHECK: ret void
-void test_vst1q_u64_x4(uint64_t *a, uint64x2x4_t b) {
- vst1q_u64_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_s8_x4(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
-// CHECK: ret void
-void test_vst1q_s8_x4(int8_t *a, int8x16x4_t b) {
- vst1q_s8_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_s16_x4(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
-// CHECK: ret void
-void test_vst1q_s16_x4(int16_t *a, int16x8x4_t b) {
- vst1q_s16_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_s32_x4(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i32* [[TMP15]])
-// CHECK: ret void
-void test_vst1q_s32_x4(int32_t *a, int32x4x4_t b) {
- vst1q_s32_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_s64_x4(
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
-// CHECK: ret void
-void test_vst1q_s64_x4(int64_t *a, int64x2x4_t b) {
- vst1q_s64_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_f16_x4(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to half*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v8f16.p0f16(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], half* [[TMP15]])
-// CHECK: ret void
-void test_vst1q_f16_x4(float16_t *a, float16x8x4_t b) {
- vst1q_f16_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_f32_x4(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], float* [[TMP15]])
-// CHECK: ret void
-void test_vst1q_f32_x4(float32_t *a, float32x4x4_t b) {
- vst1q_f32_x4(a, b);
-}
-
// CHECK-LABEL: @test_vst1q_f64_x4(
// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
@@ -16396,7 +13512,7 @@ void test_vst1q_f32_x4(float32_t *a, float32x4x4_t b) {
// CHECK: store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -16425,68 +13541,6 @@ void test_vst1q_f64_x4(float64_t *a, float64x2x4_t b) {
vst1q_f64_x4(a, b);
}
-// CHECK-LABEL: @test_vst1q_p8_x4(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
-// CHECK: ret void
-void test_vst1q_p8_x4(poly8_t *a, poly8x16x4_t b) {
- vst1q_p8_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1q_p16_x4(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
-// CHECK: ret void
-void test_vst1q_p16_x4(poly16_t *a, poly16x8x4_t b) {
- vst1q_p16_x4(a, b);
-}
-
// CHECK-LABEL: @test_vst1q_p64_x4(
// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
@@ -16494,7 +13548,7 @@ void test_vst1q_p16_x4(poly16_t *a, poly16x8x4_t b) {
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -16523,346 +13577,6 @@ void test_vst1q_p64_x4(poly64_t *a, poly64x2x4_t b) {
vst1q_p64_x4(a, b);
}
-// CHECK-LABEL: @test_vst1_u8_x4(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
-// CHECK: ret void
-void test_vst1_u8_x4(uint8_t *a, uint8x8x4_t b) {
- vst1_u8_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_u16_x4(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
-// CHECK: ret void
-void test_vst1_u16_x4(uint16_t *a, uint16x4x4_t b) {
- vst1_u16_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_u32_x4(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i32* [[TMP15]])
-// CHECK: ret void
-void test_vst1_u32_x4(uint32_t *a, uint32x2x4_t b) {
- vst1_u32_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_u64_x4(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
-// CHECK: ret void
-void test_vst1_u64_x4(uint64_t *a, uint64x1x4_t b) {
- vst1_u64_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_s8_x4(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
-// CHECK: ret void
-void test_vst1_s8_x4(int8_t *a, int8x8x4_t b) {
- vst1_s8_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_s16_x4(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
-// CHECK: ret void
-void test_vst1_s16_x4(int16_t *a, int16x4x4_t b) {
- vst1_s16_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_s32_x4(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i32* [[TMP15]])
-// CHECK: ret void
-void test_vst1_s32_x4(int32_t *a, int32x2x4_t b) {
- vst1_s32_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_s64_x4(
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
-// CHECK: ret void
-void test_vst1_s64_x4(int64_t *a, int64x1x4_t b) {
- vst1_s64_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_f16_x4(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to half*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v4f16.p0f16(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], half* [[TMP15]])
-// CHECK: ret void
-void test_vst1_f16_x4(float16_t *a, float16x4x4_t b) {
- vst1_f16_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_f32_x4(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], float* [[TMP15]])
-// CHECK: ret void
-void test_vst1_f32_x4(float32_t *a, float32x2x4_t b) {
- vst1_f32_x4(a, b);
-}
-
// CHECK-LABEL: @test_vst1_f64_x4(
// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
@@ -16870,7 +13584,7 @@ void test_vst1_f32_x4(float32_t *a, float32x2x4_t b) {
// CHECK: store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -16899,68 +13613,6 @@ void test_vst1_f64_x4(float64_t *a, float64x1x4_t b) {
vst1_f64_x4(a, b);
}
-// CHECK-LABEL: @test_vst1_p8_x4(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
-// CHECK: ret void
-void test_vst1_p8_x4(poly8_t *a, poly8x8x4_t b) {
- vst1_p8_x4(a, b);
-}
-
-// CHECK-LABEL: @test_vst1_p16_x4(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
-// CHECK: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
-// CHECK: ret void
-void test_vst1_p16_x4(poly16_t *a, poly16x4x4_t b) {
- vst1_p16_x4(a, b);
-}
-
// CHECK-LABEL: @test_vst1_p64_x4(
// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
@@ -16968,7 +13620,7 @@ void test_vst1_p16_x4(poly16_t *a, poly16x4x4_t b) {
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
diff --git a/test/CodeGen/aarch64-neon-ldst-one.c b/test/CodeGen/aarch64-neon-ldst-one.c
index a3c5b140a0d2..592b2ceffa6a 100644
--- a/test/CodeGen/aarch64-neon-ldst-one.c
+++ b/test/CodeGen/aarch64-neon-ldst-one.c
@@ -300,58 +300,6 @@ poly64x1_t test_vld1_dup_p64(poly64_t *a) {
return vld1_dup_p64(a);
}
-// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_dup_u8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
-// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x2_t [[TMP4]]
-uint8x16x2_t test_vld2q_dup_u8(uint8_t *a) {
- return vld2q_dup_u8(a);
-}
-
-// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_dup_u16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP6]]
-uint16x8x2_t test_vld2q_dup_u16(uint16_t *a) {
- return vld2q_dup_u16(a);
-}
-
-// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_dup_u32(i32* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP6]]
-uint32x4x2_t test_vld2q_dup_u32(uint32_t *a) {
- return vld2q_dup_u32(a);
-}
-
// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_dup_u64(i64* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
@@ -363,65 +311,13 @@ uint32x4x2_t test_vld2q_dup_u32(uint32_t *a) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
uint64x2x2_t test_vld2q_dup_u64(uint64_t *a) {
return vld2q_dup_u64(a);
}
-// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
-// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x2_t [[TMP4]]
-int8x16x2_t test_vld2q_dup_s8(int8_t *a) {
- return vld2q_dup_s8(a);
-}
-
-// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x2_t [[TMP6]]
-int16x8x2_t test_vld2q_dup_s16(int16_t *a) {
- return vld2q_dup_s16(a);
-}
-
-// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x2_t [[TMP6]]
-int32x4x2_t test_vld2q_dup_s32(int32_t *a) {
- return vld2q_dup_s32(a);
-}
-
// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
@@ -433,49 +329,13 @@ int32x4x2_t test_vld2q_dup_s32(int32_t *a) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x2_t [[TMP6]]
int64x2x2_t test_vld2q_dup_s64(int64_t *a) {
return vld2q_dup_s64(a);
}
-// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_dup_f16(half* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
-// CHECK: [[VLD2:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x half>, <8 x half> }*
-// CHECK: store { <8 x half>, <8 x half> } [[VLD2]], { <8 x half>, <8 x half> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x2_t [[TMP6]]
-float16x8x2_t test_vld2q_dup_f16(float16_t *a) {
- return vld2q_dup_f16(a);
-}
-
-// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
-// CHECK: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
-// CHECK: store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP6]]
-float32x4x2_t test_vld2q_dup_f32(float32_t *a) {
- return vld2q_dup_f32(a);
-}
-
// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
@@ -487,47 +347,13 @@ float32x4x2_t test_vld2q_dup_f32(float32_t *a) {
// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x2_t [[TMP6]]
float64x2x2_t test_vld2q_dup_f64(float64_t *a) {
return vld2q_dup_f64(a);
}
-// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_dup_p8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
-// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x2_t [[TMP4]]
-poly8x16x2_t test_vld2q_dup_p8(poly8_t *a) {
- return vld2q_dup_p8(a);
-}
-
-// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_dup_p16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP6]]
-poly16x8x2_t test_vld2q_dup_p16(poly16_t *a) {
- return vld2q_dup_p16(a);
-}
-
// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_dup_p64(i64* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
@@ -539,189 +365,13 @@ poly16x8x2_t test_vld2q_dup_p16(poly16_t *a) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
poly64x2x2_t test_vld2q_dup_p64(poly64_t *a) {
return vld2q_dup_p64(a);
}
-// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_dup_u8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x2_t [[TMP4]]
-uint8x8x2_t test_vld2_dup_u8(uint8_t *a) {
- return vld2_dup_u8(a);
-}
-
-// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_dup_u16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x2_t [[TMP6]]
-uint16x4x2_t test_vld2_dup_u16(uint16_t *a) {
- return vld2_dup_u16(a);
-}
-
-// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_dup_u32(i32* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x2_t [[TMP6]]
-uint32x2x2_t test_vld2_dup_u32(uint32_t *a) {
- return vld2_dup_u32(a);
-}
-
-// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_dup_u64(i64* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x2_t [[TMP6]]
-uint64x1x2_t test_vld2_dup_u64(uint64_t *a) {
- return vld2_dup_u64(a);
-}
-
-// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x2_t [[TMP4]]
-int8x8x2_t test_vld2_dup_s8(int8_t *a) {
- return vld2_dup_s8(a);
-}
-
-// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x2_t [[TMP6]]
-int16x4x2_t test_vld2_dup_s16(int16_t *a) {
- return vld2_dup_s16(a);
-}
-
-// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x2_t [[TMP6]]
-int32x2x2_t test_vld2_dup_s32(int32_t *a) {
- return vld2_dup_s32(a);
-}
-
-// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x2_t [[TMP6]]
-int64x1x2_t test_vld2_dup_s64(int64_t *a) {
- return vld2_dup_s64(a);
-}
-
-// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_dup_f16(half* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
-// CHECK: [[VLD2:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x half>, <4 x half> }*
-// CHECK: store { <4 x half>, <4 x half> } [[VLD2]], { <4 x half>, <4 x half> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x2_t [[TMP6]]
-float16x4x2_t test_vld2_dup_f16(float16_t *a) {
- return vld2_dup_f16(a);
-}
-
-// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
-// CHECK: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
-// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x2_t [[TMP6]]
-float32x2x2_t test_vld2_dup_f32(float32_t *a) {
- return vld2_dup_f32(a);
-}
-
// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
@@ -733,47 +383,13 @@ float32x2x2_t test_vld2_dup_f32(float32_t *a) {
// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x2_t [[TMP6]]
float64x1x2_t test_vld2_dup_f64(float64_t *a) {
return vld2_dup_f64(a);
}
-// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_dup_p8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x2_t [[TMP4]]
-poly8x8x2_t test_vld2_dup_p8(poly8_t *a) {
- return vld2_dup_p8(a);
-}
-
-// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_dup_p16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x2_t [[TMP6]]
-poly16x4x2_t test_vld2_dup_p16(poly16_t *a) {
- return vld2_dup_p16(a);
-}
-
// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_dup_p64(i64* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
@@ -785,68 +401,13 @@ poly16x4x2_t test_vld2_dup_p16(poly16_t *a) {
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
poly64x1x2_t test_vld2_dup_p64(poly64_t *a) {
return vld2_dup_p64(a);
}
-// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_dup_u8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x3_t [[TMP4]]
-uint8x16x3_t test_vld3q_dup_u8(uint8_t *a) {
- return vld3q_dup_u8(a);
- // [{{x[0-9]+|sp}}]
-}
-
-// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_dup_u16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x3_t [[TMP6]]
-uint16x8x3_t test_vld3q_dup_u16(uint16_t *a) {
- return vld3q_dup_u16(a);
- // [{{x[0-9]+|sp}}]
-}
-
-// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_dup_u32(i32* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x3_t [[TMP6]]
-uint32x4x3_t test_vld3q_dup_u32(uint32_t *a) {
- return vld3q_dup_u32(a);
- // [{{x[0-9]+|sp}}]
-}
-
// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld3q_dup_u64(i64* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
@@ -858,7 +419,7 @@ uint32x4x3_t test_vld3q_dup_u32(uint32_t *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x3_t [[TMP6]]
uint64x2x3_t test_vld3q_dup_u64(uint64_t *a) {
@@ -866,61 +427,6 @@ uint64x2x3_t test_vld3q_dup_u64(uint64_t *a) {
// [{{x[0-9]+|sp}}]
}
-// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x3_t [[TMP4]]
-int8x16x3_t test_vld3q_dup_s8(int8_t *a) {
- return vld3q_dup_s8(a);
- // [{{x[0-9]+|sp}}]
-}
-
-// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x3_t [[TMP6]]
-int16x8x3_t test_vld3q_dup_s16(int16_t *a) {
- return vld3q_dup_s16(a);
- // [{{x[0-9]+|sp}}]
-}
-
-// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x3_t [[TMP6]]
-int32x4x3_t test_vld3q_dup_s32(int32_t *a) {
- return vld3q_dup_s32(a);
- // [{{x[0-9]+|sp}}]
-}
-
// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
@@ -932,7 +438,7 @@ int32x4x3_t test_vld3q_dup_s32(int32_t *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x3_t [[TMP6]]
int64x2x3_t test_vld3q_dup_s64(int64_t *a) {
@@ -940,44 +446,6 @@ int64x2x3_t test_vld3q_dup_s64(int64_t *a) {
// [{{x[0-9]+|sp}}]
}
-// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_dup_f16(half* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
-// CHECK: [[VLD3:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x half>, <8 x half>, <8 x half> }*
-// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x3_t [[TMP6]]
-float16x8x3_t test_vld3q_dup_f16(float16_t *a) {
- return vld3q_dup_f16(a);
- // [{{x[0-9]+|sp}}]
-}
-
-// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
-// CHECK: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
-// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x3_t [[TMP6]]
-float32x4x3_t test_vld3q_dup_f32(float32_t *a) {
- return vld3q_dup_f32(a);
- // [{{x[0-9]+|sp}}]
-}
-
// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
@@ -989,7 +457,7 @@ float32x4x3_t test_vld3q_dup_f32(float32_t *a) {
// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x3_t [[TMP6]]
float64x2x3_t test_vld3q_dup_f64(float64_t *a) {
@@ -997,42 +465,6 @@ float64x2x3_t test_vld3q_dup_f64(float64_t *a) {
// [{{x[0-9]+|sp}}]
}
-// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_dup_p8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x3_t [[TMP4]]
-poly8x16x3_t test_vld3q_dup_p8(poly8_t *a) {
- return vld3q_dup_p8(a);
- // [{{x[0-9]+|sp}}]
-}
-
-// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_dup_p16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x3_t [[TMP6]]
-poly16x8x3_t test_vld3q_dup_p16(poly16_t *a) {
- return vld3q_dup_p16(a);
- // [{{x[0-9]+|sp}}]
-}
-
// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_dup_p64(i64* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
@@ -1044,7 +476,7 @@ poly16x8x3_t test_vld3q_dup_p16(poly16_t *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x3_t [[TMP6]]
poly64x2x3_t test_vld3q_dup_p64(poly64_t *a) {
@@ -1052,192 +484,6 @@ poly64x2x3_t test_vld3q_dup_p64(poly64_t *a) {
// [{{x[0-9]+|sp}}]
}
-// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_dup_u8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x3_t [[TMP4]]
-uint8x8x3_t test_vld3_dup_u8(uint8_t *a) {
- return vld3_dup_u8(a);
- // [{{x[0-9]+|sp}}]
-}
-
-// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_dup_u16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x3_t [[TMP6]]
-uint16x4x3_t test_vld3_dup_u16(uint16_t *a) {
- return vld3_dup_u16(a);
- // [{{x[0-9]+|sp}}]
-}
-
-// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_dup_u32(i32* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x3_t [[TMP6]]
-uint32x2x3_t test_vld3_dup_u32(uint32_t *a) {
- return vld3_dup_u32(a);
- // [{{x[0-9]+|sp}}]
-}
-
-// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_dup_u64(i64* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x3_t [[TMP6]]
-uint64x1x3_t test_vld3_dup_u64(uint64_t *a) {
- return vld3_dup_u64(a);
- // [{{x[0-9]+|sp}}]
-}
-
-// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x3_t [[TMP4]]
-int8x8x3_t test_vld3_dup_s8(int8_t *a) {
- return vld3_dup_s8(a);
- // [{{x[0-9]+|sp}}]
-}
-
-// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x3_t [[TMP6]]
-int16x4x3_t test_vld3_dup_s16(int16_t *a) {
- return vld3_dup_s16(a);
- // [{{x[0-9]+|sp}}]
-}
-
-// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x3_t [[TMP6]]
-int32x2x3_t test_vld3_dup_s32(int32_t *a) {
- return vld3_dup_s32(a);
- // [{{x[0-9]+|sp}}]
-}
-
-// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x3_t [[TMP6]]
-int64x1x3_t test_vld3_dup_s64(int64_t *a) {
- return vld3_dup_s64(a);
- // [{{x[0-9]+|sp}}]
-}
-
-// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_dup_f16(half* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
-// CHECK: [[VLD3:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x half>, <4 x half>, <4 x half> }*
-// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x3_t [[TMP6]]
-float16x4x3_t test_vld3_dup_f16(float16_t *a) {
- return vld3_dup_f16(a);
- // [{{x[0-9]+|sp}}]
-}
-
-// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
-// CHECK: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
-// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x3_t [[TMP6]]
-float32x2x3_t test_vld3_dup_f32(float32_t *a) {
- return vld3_dup_f32(a);
- // [{{x[0-9]+|sp}}]
-}
-
// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
@@ -1249,7 +495,7 @@ float32x2x3_t test_vld3_dup_f32(float32_t *a) {
// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x3_t [[TMP6]]
float64x1x3_t test_vld3_dup_f64(float64_t *a) {
@@ -1257,42 +503,6 @@ float64x1x3_t test_vld3_dup_f64(float64_t *a) {
// [{{x[0-9]+|sp}}]
}
-// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_dup_p8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x3_t [[TMP4]]
-poly8x8x3_t test_vld3_dup_p8(poly8_t *a) {
- return vld3_dup_p8(a);
- // [{{x[0-9]+|sp}}]
-}
-
-// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_dup_p16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x3_t [[TMP6]]
-poly16x4x3_t test_vld3_dup_p16(poly16_t *a) {
- return vld3_dup_p16(a);
- // [{{x[0-9]+|sp}}]
-}
-
// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_dup_p64(i64* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
@@ -1304,7 +514,7 @@ poly16x4x3_t test_vld3_dup_p16(poly16_t *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x3_t [[TMP6]]
poly64x1x3_t test_vld3_dup_p64(poly64_t *a) {
@@ -1312,58 +522,6 @@ poly64x1x3_t test_vld3_dup_p64(poly64_t *a) {
// [{{x[0-9]+|sp}}]
}
-// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_dup_u8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
-// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x4_t [[TMP4]]
-uint8x16x4_t test_vld4q_dup_u8(uint8_t *a) {
- return vld4q_dup_u8(a);
-}
-
-// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_dup_u16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x4_t [[TMP6]]
-uint16x8x4_t test_vld4q_dup_u16(uint16_t *a) {
- return vld4q_dup_u16(a);
-}
-
-// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_dup_u32(i32* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x4_t [[TMP6]]
-uint32x4x4_t test_vld4q_dup_u32(uint32_t *a) {
- return vld4q_dup_u32(a);
-}
-
// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_dup_u64(i64* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
@@ -1375,65 +533,13 @@ uint32x4x4_t test_vld4q_dup_u32(uint32_t *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x4_t [[TMP6]]
uint64x2x4_t test_vld4q_dup_u64(uint64_t *a) {
return vld4q_dup_u64(a);
}
-// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
-// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x4_t [[TMP4]]
-int8x16x4_t test_vld4q_dup_s8(int8_t *a) {
- return vld4q_dup_s8(a);
-}
-
-// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x4_t [[TMP6]]
-int16x8x4_t test_vld4q_dup_s16(int16_t *a) {
- return vld4q_dup_s16(a);
-}
-
-// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x4_t [[TMP6]]
-int32x4x4_t test_vld4q_dup_s32(int32_t *a) {
- return vld4q_dup_s32(a);
-}
-
// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
@@ -1445,49 +551,13 @@ int32x4x4_t test_vld4q_dup_s32(int32_t *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x4_t [[TMP6]]
int64x2x4_t test_vld4q_dup_s64(int64_t *a) {
return vld4q_dup_s64(a);
}
-// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_dup_f16(half* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
-// CHECK: [[VLD4:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x half>, <8 x half>, <8 x half>, <8 x half> }*
-// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x4_t [[TMP6]]
-float16x8x4_t test_vld4q_dup_f16(float16_t *a) {
- return vld4q_dup_f16(a);
-}
-
-// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
-// CHECK: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
-// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x4_t [[TMP6]]
-float32x4x4_t test_vld4q_dup_f32(float32_t *a) {
- return vld4q_dup_f32(a);
-}
-
// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
@@ -1499,47 +569,13 @@ float32x4x4_t test_vld4q_dup_f32(float32_t *a) {
// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x4_t [[TMP6]]
float64x2x4_t test_vld4q_dup_f64(float64_t *a) {
return vld4q_dup_f64(a);
}
-// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_dup_p8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x4_t [[TMP4]]
-poly8x16x4_t test_vld4q_dup_p8(poly8_t *a) {
- return vld4q_dup_p8(a);
-}
-
-// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_dup_p16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x4_t [[TMP6]]
-poly16x8x4_t test_vld4q_dup_p16(poly16_t *a) {
- return vld4q_dup_p16(a);
-}
-
// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_dup_p64(i64* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
@@ -1551,189 +587,13 @@ poly16x8x4_t test_vld4q_dup_p16(poly16_t *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x4_t [[TMP6]]
poly64x2x4_t test_vld4q_dup_p64(poly64_t *a) {
return vld4q_dup_p64(a);
}
-// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_dup_u8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x4_t [[TMP4]]
-uint8x8x4_t test_vld4_dup_u8(uint8_t *a) {
- return vld4_dup_u8(a);
-}
-
-// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_dup_u16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x4_t [[TMP6]]
-uint16x4x4_t test_vld4_dup_u16(uint16_t *a) {
- return vld4_dup_u16(a);
-}
-
-// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_dup_u32(i32* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x4_t [[TMP6]]
-uint32x2x4_t test_vld4_dup_u32(uint32_t *a) {
- return vld4_dup_u32(a);
-}
-
-// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_dup_u64(i64* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x4_t [[TMP6]]
-uint64x1x4_t test_vld4_dup_u64(uint64_t *a) {
- return vld4_dup_u64(a);
-}
-
-// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x4_t [[TMP4]]
-int8x8x4_t test_vld4_dup_s8(int8_t *a) {
- return vld4_dup_s8(a);
-}
-
-// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x4_t [[TMP6]]
-int16x4x4_t test_vld4_dup_s16(int16_t *a) {
- return vld4_dup_s16(a);
-}
-
-// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
-// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x4_t [[TMP6]]
-int32x2x4_t test_vld4_dup_s32(int32_t *a) {
- return vld4_dup_s32(a);
-}
-
-// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x4_t [[TMP6]]
-int64x1x4_t test_vld4_dup_s64(int64_t *a) {
- return vld4_dup_s64(a);
-}
-
-// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_dup_f16(half* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
-// CHECK: [[VLD4:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x half>, <4 x half>, <4 x half>, <4 x half> }*
-// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x4_t [[TMP6]]
-float16x4x4_t test_vld4_dup_f16(float16_t *a) {
- return vld4_dup_f16(a);
-}
-
-// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
-// CHECK: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
-// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x4_t [[TMP6]]
-float32x2x4_t test_vld4_dup_f32(float32_t *a) {
- return vld4_dup_f32(a);
-}
-
// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
@@ -1745,47 +605,13 @@ float32x2x4_t test_vld4_dup_f32(float32_t *a) {
// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x4_t [[TMP6]]
float64x1x4_t test_vld4_dup_f64(float64_t *a) {
return vld4_dup_f64(a);
}
-// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_dup_p8(i8* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x4_t [[TMP4]]
-poly8x8x4_t test_vld4_dup_p8(poly8_t *a) {
- return vld4_dup_p8(a);
-}
-
-// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_dup_p16(i16* %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
-// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x4_t [[TMP6]]
-poly16x4x4_t test_vld4_dup_p16(poly16_t *a) {
- return vld4_dup_p16(a);
-}
-
// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_dup_p64(i64* %a) #0 {
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
@@ -1797,7 +623,7 @@ poly16x4x4_t test_vld4_dup_p16(poly16_t *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x4_t [[TMP6]]
poly64x1x4_t test_vld4_dup_p64(poly64_t *a) {
@@ -2125,7 +951,7 @@ poly64x1_t test_vld1_lane_p64(poly64_t *a, poly64x1_t b) {
// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[SRC]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -2138,7 +964,7 @@ poly64x1_t test_vld1_lane_p64(poly64_t *a, poly64x1_t b) {
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
// CHECK: [[TMP6:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x2_t [[TMP8]]
int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
@@ -2154,7 +980,7 @@ int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[SRC]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -2167,7 +993,7 @@ int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
@@ -2183,7 +1009,7 @@ uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[SRC]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -2196,7 +1022,7 @@ uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
// CHECK: [[TMP6:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
@@ -2212,7 +1038,7 @@ poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
// CHECK: store [3 x <16 x i8>] [[SRC]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[SRC]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -2228,7 +1054,7 @@ poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
// CHECK: [[TMP9:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x3_t [[TMP9]]
int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
@@ -2244,7 +1070,7 @@ int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
// CHECK: store [3 x <16 x i8>] [[SRC]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[SRC]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -2260,7 +1086,7 @@ int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
// CHECK: [[TMP9:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x3_t [[TMP9]]
uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
@@ -2276,7 +1102,7 @@ uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
@@ -2294,7 +1120,7 @@ uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x2_t [[TMP13]]
uint16x8x2_t test_vld2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
@@ -2310,7 +1136,7 @@ uint16x8x2_t test_vld2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
@@ -2328,7 +1154,7 @@ uint16x8x2_t test_vld2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x2_t [[TMP13]]
uint32x4x2_t test_vld2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
@@ -2344,7 +1170,7 @@ uint32x4x2_t test_vld2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
@@ -2362,7 +1188,7 @@ uint32x4x2_t test_vld2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x2_t [[TMP13]]
uint64x2x2_t test_vld2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
@@ -2378,7 +1204,7 @@ uint64x2x2_t test_vld2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
@@ -2396,7 +1222,7 @@ uint64x2x2_t test_vld2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x2_t [[TMP13]]
int16x8x2_t test_vld2q_lane_s16(int16_t *a, int16x8x2_t b) {
@@ -2412,7 +1238,7 @@ int16x8x2_t test_vld2q_lane_s16(int16_t *a, int16x8x2_t b) {
// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
@@ -2430,7 +1256,7 @@ int16x8x2_t test_vld2q_lane_s16(int16_t *a, int16x8x2_t b) {
// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x2_t [[TMP13]]
int32x4x2_t test_vld2q_lane_s32(int32_t *a, int32x4x2_t b) {
@@ -2446,7 +1272,7 @@ int32x4x2_t test_vld2q_lane_s32(int32_t *a, int32x4x2_t b) {
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
@@ -2464,7 +1290,7 @@ int32x4x2_t test_vld2q_lane_s32(int32_t *a, int32x4x2_t b) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x2_t [[TMP13]]
int64x2x2_t test_vld2q_lane_s64(int64_t *a, int64x2x2_t b) {
@@ -2480,7 +1306,7 @@ int64x2x2_t test_vld2q_lane_s64(int64_t *a, int64x2x2_t b) {
// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
@@ -2498,7 +1324,7 @@ int64x2x2_t test_vld2q_lane_s64(int64_t *a, int64x2x2_t b) {
// CHECK: store { <8 x half>, <8 x half> } [[VLD2_LANE]], { <8 x half>, <8 x half> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x2_t [[TMP13]]
float16x8x2_t test_vld2q_lane_f16(float16_t *a, float16x8x2_t b) {
@@ -2514,7 +1340,7 @@ float16x8x2_t test_vld2q_lane_f16(float16_t *a, float16x8x2_t b) {
// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
@@ -2532,7 +1358,7 @@ float16x8x2_t test_vld2q_lane_f16(float16_t *a, float16x8x2_t b) {
// CHECK: store { <4 x float>, <4 x float> } [[VLD2_LANE]], { <4 x float>, <4 x float> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x2_t [[TMP13]]
float32x4x2_t test_vld2q_lane_f32(float32_t *a, float32x4x2_t b) {
@@ -2548,7 +1374,7 @@ float32x4x2_t test_vld2q_lane_f32(float32_t *a, float32x4x2_t b) {
// CHECK: store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
@@ -2566,7 +1392,7 @@ float32x4x2_t test_vld2q_lane_f32(float32_t *a, float32x4x2_t b) {
// CHECK: store { <2 x double>, <2 x double> } [[VLD2_LANE]], { <2 x double>, <2 x double> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x2_t [[TMP13]]
float64x2x2_t test_vld2q_lane_f64(float64_t *a, float64x2x2_t b) {
@@ -2582,7 +1408,7 @@ float64x2x2_t test_vld2q_lane_f64(float64_t *a, float64x2x2_t b) {
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
@@ -2600,7 +1426,7 @@ float64x2x2_t test_vld2q_lane_f64(float64_t *a, float64x2x2_t b) {
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x2_t [[TMP13]]
poly16x8x2_t test_vld2q_lane_p16(poly16_t *a, poly16x8x2_t b) {
@@ -2616,7 +1442,7 @@ poly16x8x2_t test_vld2q_lane_p16(poly16_t *a, poly16x8x2_t b) {
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
@@ -2634,7 +1460,7 @@ poly16x8x2_t test_vld2q_lane_p16(poly16_t *a, poly16x8x2_t b) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x2_t [[TMP13]]
poly64x2x2_t test_vld2q_lane_p64(poly64_t *a, poly64x2x2_t b) {
@@ -2650,7 +1476,7 @@ poly64x2x2_t test_vld2q_lane_p64(poly64_t *a, poly64x2x2_t b) {
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -2663,7 +1489,7 @@ poly64x2x2_t test_vld2q_lane_p64(poly64_t *a, poly64x2x2_t b) {
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x2_t [[TMP8]]
uint8x8x2_t test_vld2_lane_u8(uint8_t *a, uint8x8x2_t b) {
@@ -2679,7 +1505,7 @@ uint8x8x2_t test_vld2_lane_u8(uint8_t *a, uint8x8x2_t b) {
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
@@ -2697,7 +1523,7 @@ uint8x8x2_t test_vld2_lane_u8(uint8_t *a, uint8x8x2_t b) {
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x2_t [[TMP13]]
uint16x4x2_t test_vld2_lane_u16(uint16_t *a, uint16x4x2_t b) {
@@ -2713,7 +1539,7 @@ uint16x4x2_t test_vld2_lane_u16(uint16_t *a, uint16x4x2_t b) {
// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
@@ -2731,7 +1557,7 @@ uint16x4x2_t test_vld2_lane_u16(uint16_t *a, uint16x4x2_t b) {
// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x2_t [[TMP13]]
uint32x2x2_t test_vld2_lane_u32(uint32_t *a, uint32x2x2_t b) {
@@ -2747,7 +1573,7 @@ uint32x2x2_t test_vld2_lane_u32(uint32_t *a, uint32x2x2_t b) {
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
@@ -2765,7 +1591,7 @@ uint32x2x2_t test_vld2_lane_u32(uint32_t *a, uint32x2x2_t b) {
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x2_t [[TMP13]]
uint64x1x2_t test_vld2_lane_u64(uint64_t *a, uint64x1x2_t b) {
@@ -2781,7 +1607,7 @@ uint64x1x2_t test_vld2_lane_u64(uint64_t *a, uint64x1x2_t b) {
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -2794,7 +1620,7 @@ uint64x1x2_t test_vld2_lane_u64(uint64_t *a, uint64x1x2_t b) {
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
// CHECK: [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x2_t [[TMP8]]
int8x8x2_t test_vld2_lane_s8(int8_t *a, int8x8x2_t b) {
@@ -2810,7 +1636,7 @@ int8x8x2_t test_vld2_lane_s8(int8_t *a, int8x8x2_t b) {
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
@@ -2828,7 +1654,7 @@ int8x8x2_t test_vld2_lane_s8(int8_t *a, int8x8x2_t b) {
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x2_t [[TMP13]]
int16x4x2_t test_vld2_lane_s16(int16_t *a, int16x4x2_t b) {
@@ -2844,7 +1670,7 @@ int16x4x2_t test_vld2_lane_s16(int16_t *a, int16x4x2_t b) {
// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
@@ -2862,7 +1688,7 @@ int16x4x2_t test_vld2_lane_s16(int16_t *a, int16x4x2_t b) {
// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x2_t [[TMP13]]
int32x2x2_t test_vld2_lane_s32(int32_t *a, int32x2x2_t b) {
@@ -2878,7 +1704,7 @@ int32x2x2_t test_vld2_lane_s32(int32_t *a, int32x2x2_t b) {
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
@@ -2896,7 +1722,7 @@ int32x2x2_t test_vld2_lane_s32(int32_t *a, int32x2x2_t b) {
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x2_t [[TMP13]]
int64x1x2_t test_vld2_lane_s64(int64_t *a, int64x1x2_t b) {
@@ -2912,7 +1738,7 @@ int64x1x2_t test_vld2_lane_s64(int64_t *a, int64x1x2_t b) {
// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
@@ -2930,7 +1756,7 @@ int64x1x2_t test_vld2_lane_s64(int64_t *a, int64x1x2_t b) {
// CHECK: store { <4 x half>, <4 x half> } [[VLD2_LANE]], { <4 x half>, <4 x half> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x2_t [[TMP13]]
float16x4x2_t test_vld2_lane_f16(float16_t *a, float16x4x2_t b) {
@@ -2946,7 +1772,7 @@ float16x4x2_t test_vld2_lane_f16(float16_t *a, float16x4x2_t b) {
// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
@@ -2964,7 +1790,7 @@ float16x4x2_t test_vld2_lane_f16(float16_t *a, float16x4x2_t b) {
// CHECK: store { <2 x float>, <2 x float> } [[VLD2_LANE]], { <2 x float>, <2 x float> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x2_t [[TMP13]]
float32x2x2_t test_vld2_lane_f32(float32_t *a, float32x2x2_t b) {
@@ -2980,7 +1806,7 @@ float32x2x2_t test_vld2_lane_f32(float32_t *a, float32x2x2_t b) {
// CHECK: store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
@@ -2998,7 +1824,7 @@ float32x2x2_t test_vld2_lane_f32(float32_t *a, float32x2x2_t b) {
// CHECK: store { <1 x double>, <1 x double> } [[VLD2_LANE]], { <1 x double>, <1 x double> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x2_t [[TMP13]]
float64x1x2_t test_vld2_lane_f64(float64_t *a, float64x1x2_t b) {
@@ -3014,7 +1840,7 @@ float64x1x2_t test_vld2_lane_f64(float64_t *a, float64x1x2_t b) {
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -3027,7 +1853,7 @@ float64x1x2_t test_vld2_lane_f64(float64_t *a, float64x1x2_t b) {
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
// CHECK: [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x2_t [[TMP8]]
poly8x8x2_t test_vld2_lane_p8(poly8_t *a, poly8x8x2_t b) {
@@ -3043,7 +1869,7 @@ poly8x8x2_t test_vld2_lane_p8(poly8_t *a, poly8x8x2_t b) {
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
@@ -3061,7 +1887,7 @@ poly8x8x2_t test_vld2_lane_p8(poly8_t *a, poly8x8x2_t b) {
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x2_t [[TMP13]]
poly16x4x2_t test_vld2_lane_p16(poly16_t *a, poly16x4x2_t b) {
@@ -3077,7 +1903,7 @@ poly16x4x2_t test_vld2_lane_p16(poly16_t *a, poly16x4x2_t b) {
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
@@ -3095,7 +1921,7 @@ poly16x4x2_t test_vld2_lane_p16(poly16_t *a, poly16x4x2_t b) {
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x2_t [[TMP13]]
poly64x1x2_t test_vld2_lane_p64(poly64_t *a, poly64x1x2_t b) {
@@ -3111,7 +1937,7 @@ poly64x1x2_t test_vld2_lane_p64(poly64_t *a, poly64x1x2_t b) {
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
@@ -3134,7 +1960,7 @@ poly64x1x2_t test_vld2_lane_p64(poly64_t *a, poly64x1x2_t b) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x3_t [[TMP16]]
uint16x8x3_t test_vld3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
@@ -3150,7 +1976,7 @@ uint16x8x3_t test_vld3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
@@ -3173,7 +1999,7 @@ uint16x8x3_t test_vld3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x3_t [[TMP16]]
uint32x4x3_t test_vld3q_lane_u32(uint32_t *a, uint32x4x3_t b) {
@@ -3189,7 +2015,7 @@ uint32x4x3_t test_vld3q_lane_u32(uint32_t *a, uint32x4x3_t b) {
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
@@ -3212,7 +2038,7 @@ uint32x4x3_t test_vld3q_lane_u32(uint32_t *a, uint32x4x3_t b) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x3_t [[TMP16]]
uint64x2x3_t test_vld3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
@@ -3228,7 +2054,7 @@ uint64x2x3_t test_vld3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
@@ -3251,7 +2077,7 @@ uint64x2x3_t test_vld3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x3_t [[TMP16]]
int16x8x3_t test_vld3q_lane_s16(int16_t *a, int16x8x3_t b) {
@@ -3267,7 +2093,7 @@ int16x8x3_t test_vld3q_lane_s16(int16_t *a, int16x8x3_t b) {
// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
@@ -3290,7 +2116,7 @@ int16x8x3_t test_vld3q_lane_s16(int16_t *a, int16x8x3_t b) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x3_t [[TMP16]]
int32x4x3_t test_vld3q_lane_s32(int32_t *a, int32x4x3_t b) {
@@ -3306,7 +2132,7 @@ int32x4x3_t test_vld3q_lane_s32(int32_t *a, int32x4x3_t b) {
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
@@ -3329,7 +2155,7 @@ int32x4x3_t test_vld3q_lane_s32(int32_t *a, int32x4x3_t b) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x3_t [[TMP16]]
int64x2x3_t test_vld3q_lane_s64(int64_t *a, int64x2x3_t b) {
@@ -3345,7 +2171,7 @@ int64x2x3_t test_vld3q_lane_s64(int64_t *a, int64x2x3_t b) {
// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
@@ -3368,7 +2194,7 @@ int64x2x3_t test_vld3q_lane_s64(int64_t *a, int64x2x3_t b) {
// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x3_t [[TMP16]]
float16x8x3_t test_vld3q_lane_f16(float16_t *a, float16x8x3_t b) {
@@ -3384,7 +2210,7 @@ float16x8x3_t test_vld3q_lane_f16(float16_t *a, float16x8x3_t b) {
// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
@@ -3407,7 +2233,7 @@ float16x8x3_t test_vld3q_lane_f16(float16_t *a, float16x8x3_t b) {
// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x3_t [[TMP16]]
float32x4x3_t test_vld3q_lane_f32(float32_t *a, float32x4x3_t b) {
@@ -3423,7 +2249,7 @@ float32x4x3_t test_vld3q_lane_f32(float32_t *a, float32x4x3_t b) {
// CHECK: store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
@@ -3446,7 +2272,7 @@ float32x4x3_t test_vld3q_lane_f32(float32_t *a, float32x4x3_t b) {
// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x3_t [[TMP16]]
float64x2x3_t test_vld3q_lane_f64(float64_t *a, float64x2x3_t b) {
@@ -3462,7 +2288,7 @@ float64x2x3_t test_vld3q_lane_f64(float64_t *a, float64x2x3_t b) {
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -3478,7 +2304,7 @@ float64x2x3_t test_vld3q_lane_f64(float64_t *a, float64x2x3_t b) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
// CHECK: [[TMP9:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x3_t [[TMP9]]
poly8x16x3_t test_vld3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
@@ -3494,7 +2320,7 @@ poly8x16x3_t test_vld3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
@@ -3517,7 +2343,7 @@ poly8x16x3_t test_vld3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x3_t [[TMP16]]
poly16x8x3_t test_vld3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
@@ -3533,7 +2359,7 @@ poly16x8x3_t test_vld3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
@@ -3556,7 +2382,7 @@ poly16x8x3_t test_vld3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x3_t [[TMP16]]
poly64x2x3_t test_vld3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
@@ -3572,7 +2398,7 @@ poly64x2x3_t test_vld3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -3588,7 +2414,7 @@ poly64x2x3_t test_vld3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
// CHECK: [[TMP9:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x3_t [[TMP9]]
uint8x8x3_t test_vld3_lane_u8(uint8_t *a, uint8x8x3_t b) {
@@ -3604,7 +2430,7 @@ uint8x8x3_t test_vld3_lane_u8(uint8_t *a, uint8x8x3_t b) {
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
@@ -3627,7 +2453,7 @@ uint8x8x3_t test_vld3_lane_u8(uint8_t *a, uint8x8x3_t b) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x3_t [[TMP16]]
uint16x4x3_t test_vld3_lane_u16(uint16_t *a, uint16x4x3_t b) {
@@ -3643,7 +2469,7 @@ uint16x4x3_t test_vld3_lane_u16(uint16_t *a, uint16x4x3_t b) {
// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
@@ -3666,7 +2492,7 @@ uint16x4x3_t test_vld3_lane_u16(uint16_t *a, uint16x4x3_t b) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x3_t [[TMP16]]
uint32x2x3_t test_vld3_lane_u32(uint32_t *a, uint32x2x3_t b) {
@@ -3682,7 +2508,7 @@ uint32x2x3_t test_vld3_lane_u32(uint32_t *a, uint32x2x3_t b) {
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
@@ -3705,7 +2531,7 @@ uint32x2x3_t test_vld3_lane_u32(uint32_t *a, uint32x2x3_t b) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x3_t [[TMP16]]
uint64x1x3_t test_vld3_lane_u64(uint64_t *a, uint64x1x3_t b) {
@@ -3721,7 +2547,7 @@ uint64x1x3_t test_vld3_lane_u64(uint64_t *a, uint64x1x3_t b) {
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -3737,7 +2563,7 @@ uint64x1x3_t test_vld3_lane_u64(uint64_t *a, uint64x1x3_t b) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
// CHECK: [[TMP9:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x3_t [[TMP9]]
int8x8x3_t test_vld3_lane_s8(int8_t *a, int8x8x3_t b) {
@@ -3753,7 +2579,7 @@ int8x8x3_t test_vld3_lane_s8(int8_t *a, int8x8x3_t b) {
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
@@ -3776,7 +2602,7 @@ int8x8x3_t test_vld3_lane_s8(int8_t *a, int8x8x3_t b) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x3_t [[TMP16]]
int16x4x3_t test_vld3_lane_s16(int16_t *a, int16x4x3_t b) {
@@ -3792,7 +2618,7 @@ int16x4x3_t test_vld3_lane_s16(int16_t *a, int16x4x3_t b) {
// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
@@ -3815,7 +2641,7 @@ int16x4x3_t test_vld3_lane_s16(int16_t *a, int16x4x3_t b) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x3_t [[TMP16]]
int32x2x3_t test_vld3_lane_s32(int32_t *a, int32x2x3_t b) {
@@ -3831,7 +2657,7 @@ int32x2x3_t test_vld3_lane_s32(int32_t *a, int32x2x3_t b) {
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
@@ -3854,7 +2680,7 @@ int32x2x3_t test_vld3_lane_s32(int32_t *a, int32x2x3_t b) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x3_t [[TMP16]]
int64x1x3_t test_vld3_lane_s64(int64_t *a, int64x1x3_t b) {
@@ -3870,7 +2696,7 @@ int64x1x3_t test_vld3_lane_s64(int64_t *a, int64x1x3_t b) {
// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
@@ -3893,7 +2719,7 @@ int64x1x3_t test_vld3_lane_s64(int64_t *a, int64x1x3_t b) {
// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x3_t [[TMP16]]
float16x4x3_t test_vld3_lane_f16(float16_t *a, float16x4x3_t b) {
@@ -3909,7 +2735,7 @@ float16x4x3_t test_vld3_lane_f16(float16_t *a, float16x4x3_t b) {
// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
@@ -3932,7 +2758,7 @@ float16x4x3_t test_vld3_lane_f16(float16_t *a, float16x4x3_t b) {
// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x3_t [[TMP16]]
float32x2x3_t test_vld3_lane_f32(float32_t *a, float32x2x3_t b) {
@@ -3948,7 +2774,7 @@ float32x2x3_t test_vld3_lane_f32(float32_t *a, float32x2x3_t b) {
// CHECK: store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
@@ -3971,7 +2797,7 @@ float32x2x3_t test_vld3_lane_f32(float32_t *a, float32x2x3_t b) {
// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x3_t [[TMP16]]
float64x1x3_t test_vld3_lane_f64(float64_t *a, float64x1x3_t b) {
@@ -3987,7 +2813,7 @@ float64x1x3_t test_vld3_lane_f64(float64_t *a, float64x1x3_t b) {
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -4003,7 +2829,7 @@ float64x1x3_t test_vld3_lane_f64(float64_t *a, float64x1x3_t b) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
// CHECK: [[TMP9:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x3_t [[TMP9]]
poly8x8x3_t test_vld3_lane_p8(poly8_t *a, poly8x8x3_t b) {
@@ -4019,7 +2845,7 @@ poly8x8x3_t test_vld3_lane_p8(poly8_t *a, poly8x8x3_t b) {
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
@@ -4042,7 +2868,7 @@ poly8x8x3_t test_vld3_lane_p8(poly8_t *a, poly8x8x3_t b) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x3_t [[TMP16]]
poly16x4x3_t test_vld3_lane_p16(poly16_t *a, poly16x4x3_t b) {
@@ -4058,7 +2884,7 @@ poly16x4x3_t test_vld3_lane_p16(poly16_t *a, poly16x4x3_t b) {
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
@@ -4081,7 +2907,7 @@ poly16x4x3_t test_vld3_lane_p16(poly16_t *a, poly16x4x3_t b) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x3_t [[TMP16]]
poly64x1x3_t test_vld3_lane_p64(poly64_t *a, poly64x1x3_t b) {
@@ -4097,7 +2923,7 @@ poly64x1x3_t test_vld3_lane_p64(poly64_t *a, poly64x1x3_t b) {
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -4116,7 +2942,7 @@ poly64x1x3_t test_vld3_lane_p64(poly64_t *a, poly64x1x3_t b) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP9:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
// CHECK: [[TMP10:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x4_t [[TMP10]]
uint8x16x4_t test_vld4q_lane_u8(uint8_t *a, uint8x16x4_t b) {
@@ -4132,7 +2958,7 @@ uint8x16x4_t test_vld4q_lane_u8(uint8_t *a, uint8x16x4_t b) {
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
@@ -4160,7 +2986,7 @@ uint8x16x4_t test_vld4q_lane_u8(uint8_t *a, uint8x16x4_t b) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x4_t [[TMP19]]
uint16x8x4_t test_vld4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
@@ -4176,7 +3002,7 @@ uint16x8x4_t test_vld4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
@@ -4204,7 +3030,7 @@ uint16x8x4_t test_vld4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x4_t [[TMP19]]
uint32x4x4_t test_vld4q_lane_u32(uint32_t *a, uint32x4x4_t b) {
@@ -4220,7 +3046,7 @@ uint32x4x4_t test_vld4q_lane_u32(uint32_t *a, uint32x4x4_t b) {
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
@@ -4248,7 +3074,7 @@ uint32x4x4_t test_vld4q_lane_u32(uint32_t *a, uint32x4x4_t b) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x4_t [[TMP19]]
uint64x2x4_t test_vld4q_lane_u64(uint64_t *a, uint64x2x4_t b) {
@@ -4264,7 +3090,7 @@ uint64x2x4_t test_vld4q_lane_u64(uint64_t *a, uint64x2x4_t b) {
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -4283,7 +3109,7 @@ uint64x2x4_t test_vld4q_lane_u64(uint64_t *a, uint64x2x4_t b) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP9:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
// CHECK: [[TMP10:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x4_t [[TMP10]]
int8x16x4_t test_vld4q_lane_s8(int8_t *a, int8x16x4_t b) {
@@ -4299,7 +3125,7 @@ int8x16x4_t test_vld4q_lane_s8(int8_t *a, int8x16x4_t b) {
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
@@ -4327,7 +3153,7 @@ int8x16x4_t test_vld4q_lane_s8(int8_t *a, int8x16x4_t b) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x4_t [[TMP19]]
int16x8x4_t test_vld4q_lane_s16(int16_t *a, int16x8x4_t b) {
@@ -4343,7 +3169,7 @@ int16x8x4_t test_vld4q_lane_s16(int16_t *a, int16x8x4_t b) {
// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
@@ -4371,7 +3197,7 @@ int16x8x4_t test_vld4q_lane_s16(int16_t *a, int16x8x4_t b) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x4_t [[TMP19]]
int32x4x4_t test_vld4q_lane_s32(int32_t *a, int32x4x4_t b) {
@@ -4387,7 +3213,7 @@ int32x4x4_t test_vld4q_lane_s32(int32_t *a, int32x4x4_t b) {
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
@@ -4415,7 +3241,7 @@ int32x4x4_t test_vld4q_lane_s32(int32_t *a, int32x4x4_t b) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x4_t [[TMP19]]
int64x2x4_t test_vld4q_lane_s64(int64_t *a, int64x2x4_t b) {
@@ -4431,7 +3257,7 @@ int64x2x4_t test_vld4q_lane_s64(int64_t *a, int64x2x4_t b) {
// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
@@ -4459,7 +3285,7 @@ int64x2x4_t test_vld4q_lane_s64(int64_t *a, int64x2x4_t b) {
// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x4_t [[TMP19]]
float16x8x4_t test_vld4q_lane_f16(float16_t *a, float16x8x4_t b) {
@@ -4475,7 +3301,7 @@ float16x8x4_t test_vld4q_lane_f16(float16_t *a, float16x8x4_t b) {
// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
@@ -4503,7 +3329,7 @@ float16x8x4_t test_vld4q_lane_f16(float16_t *a, float16x8x4_t b) {
// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x4_t [[TMP19]]
float32x4x4_t test_vld4q_lane_f32(float32_t *a, float32x4x4_t b) {
@@ -4519,7 +3345,7 @@ float32x4x4_t test_vld4q_lane_f32(float32_t *a, float32x4x4_t b) {
// CHECK: store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
@@ -4547,7 +3373,7 @@ float32x4x4_t test_vld4q_lane_f32(float32_t *a, float32x4x4_t b) {
// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x4_t [[TMP19]]
float64x2x4_t test_vld4q_lane_f64(float64_t *a, float64x2x4_t b) {
@@ -4563,7 +3389,7 @@ float64x2x4_t test_vld4q_lane_f64(float64_t *a, float64x2x4_t b) {
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
@@ -4582,7 +3408,7 @@ float64x2x4_t test_vld4q_lane_f64(float64_t *a, float64x2x4_t b) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP9:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
// CHECK: [[TMP10:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x4_t [[TMP10]]
poly8x16x4_t test_vld4q_lane_p8(poly8_t *a, poly8x16x4_t b) {
@@ -4598,7 +3424,7 @@ poly8x16x4_t test_vld4q_lane_p8(poly8_t *a, poly8x16x4_t b) {
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
@@ -4626,7 +3452,7 @@ poly8x16x4_t test_vld4q_lane_p8(poly8_t *a, poly8x16x4_t b) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x4_t [[TMP19]]
poly16x8x4_t test_vld4q_lane_p16(poly16_t *a, poly16x8x4_t b) {
@@ -4642,7 +3468,7 @@ poly16x8x4_t test_vld4q_lane_p16(poly16_t *a, poly16x8x4_t b) {
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
@@ -4670,7 +3496,7 @@ poly16x8x4_t test_vld4q_lane_p16(poly16_t *a, poly16x8x4_t b) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x4_t [[TMP19]]
poly64x2x4_t test_vld4q_lane_p64(poly64_t *a, poly64x2x4_t b) {
@@ -4686,7 +3512,7 @@ poly64x2x4_t test_vld4q_lane_p64(poly64_t *a, poly64x2x4_t b) {
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -4705,7 +3531,7 @@ poly64x2x4_t test_vld4q_lane_p64(poly64_t *a, poly64x2x4_t b) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
// CHECK: [[TMP10:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x4_t [[TMP10]]
uint8x8x4_t test_vld4_lane_u8(uint8_t *a, uint8x8x4_t b) {
@@ -4721,7 +3547,7 @@ uint8x8x4_t test_vld4_lane_u8(uint8_t *a, uint8x8x4_t b) {
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
@@ -4749,7 +3575,7 @@ uint8x8x4_t test_vld4_lane_u8(uint8_t *a, uint8x8x4_t b) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x4_t [[TMP19]]
uint16x4x4_t test_vld4_lane_u16(uint16_t *a, uint16x4x4_t b) {
@@ -4765,7 +3591,7 @@ uint16x4x4_t test_vld4_lane_u16(uint16_t *a, uint16x4x4_t b) {
// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
@@ -4793,7 +3619,7 @@ uint16x4x4_t test_vld4_lane_u16(uint16_t *a, uint16x4x4_t b) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x4_t [[TMP19]]
uint32x2x4_t test_vld4_lane_u32(uint32_t *a, uint32x2x4_t b) {
@@ -4809,7 +3635,7 @@ uint32x2x4_t test_vld4_lane_u32(uint32_t *a, uint32x2x4_t b) {
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
@@ -4837,7 +3663,7 @@ uint32x2x4_t test_vld4_lane_u32(uint32_t *a, uint32x2x4_t b) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x4_t [[TMP19]]
uint64x1x4_t test_vld4_lane_u64(uint64_t *a, uint64x1x4_t b) {
@@ -4853,7 +3679,7 @@ uint64x1x4_t test_vld4_lane_u64(uint64_t *a, uint64x1x4_t b) {
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -4872,7 +3698,7 @@ uint64x1x4_t test_vld4_lane_u64(uint64_t *a, uint64x1x4_t b) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
// CHECK: [[TMP10:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x4_t [[TMP10]]
int8x8x4_t test_vld4_lane_s8(int8_t *a, int8x8x4_t b) {
@@ -4888,7 +3714,7 @@ int8x8x4_t test_vld4_lane_s8(int8_t *a, int8x8x4_t b) {
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
@@ -4916,7 +3742,7 @@ int8x8x4_t test_vld4_lane_s8(int8_t *a, int8x8x4_t b) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x4_t [[TMP19]]
int16x4x4_t test_vld4_lane_s16(int16_t *a, int16x4x4_t b) {
@@ -4932,7 +3758,7 @@ int16x4x4_t test_vld4_lane_s16(int16_t *a, int16x4x4_t b) {
// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
@@ -4960,7 +3786,7 @@ int16x4x4_t test_vld4_lane_s16(int16_t *a, int16x4x4_t b) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x4_t [[TMP19]]
int32x2x4_t test_vld4_lane_s32(int32_t *a, int32x2x4_t b) {
@@ -4976,7 +3802,7 @@ int32x2x4_t test_vld4_lane_s32(int32_t *a, int32x2x4_t b) {
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
@@ -5004,7 +3830,7 @@ int32x2x4_t test_vld4_lane_s32(int32_t *a, int32x2x4_t b) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x4_t [[TMP19]]
int64x1x4_t test_vld4_lane_s64(int64_t *a, int64x1x4_t b) {
@@ -5020,7 +3846,7 @@ int64x1x4_t test_vld4_lane_s64(int64_t *a, int64x1x4_t b) {
// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
@@ -5048,7 +3874,7 @@ int64x1x4_t test_vld4_lane_s64(int64_t *a, int64x1x4_t b) {
// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x4_t [[TMP19]]
float16x4x4_t test_vld4_lane_f16(float16_t *a, float16x4x4_t b) {
@@ -5064,7 +3890,7 @@ float16x4x4_t test_vld4_lane_f16(float16_t *a, float16x4x4_t b) {
// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
@@ -5092,7 +3918,7 @@ float16x4x4_t test_vld4_lane_f16(float16_t *a, float16x4x4_t b) {
// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x4_t [[TMP19]]
float32x2x4_t test_vld4_lane_f32(float32_t *a, float32x2x4_t b) {
@@ -5108,7 +3934,7 @@ float32x2x4_t test_vld4_lane_f32(float32_t *a, float32x2x4_t b) {
// CHECK: store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
@@ -5136,7 +3962,7 @@ float32x2x4_t test_vld4_lane_f32(float32_t *a, float32x2x4_t b) {
// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x4_t [[TMP19]]
float64x1x4_t test_vld4_lane_f64(float64_t *a, float64x1x4_t b) {
@@ -5152,7 +3978,7 @@ float64x1x4_t test_vld4_lane_f64(float64_t *a, float64x1x4_t b) {
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
@@ -5171,7 +3997,7 @@ float64x1x4_t test_vld4_lane_f64(float64_t *a, float64x1x4_t b) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
// CHECK: [[TMP10:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x4_t [[TMP10]]
poly8x8x4_t test_vld4_lane_p8(poly8_t *a, poly8x8x4_t b) {
@@ -5187,7 +4013,7 @@ poly8x8x4_t test_vld4_lane_p8(poly8_t *a, poly8x8x4_t b) {
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
@@ -5215,7 +4041,7 @@ poly8x8x4_t test_vld4_lane_p8(poly8_t *a, poly8x8x4_t b) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x4_t [[TMP19]]
poly16x4x4_t test_vld4_lane_p16(poly16_t *a, poly16x4x4_t b) {
@@ -5231,7 +4057,7 @@ poly16x4x4_t test_vld4_lane_p16(poly16_t *a, poly16x4x4_t b) {
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
@@ -5259,7 +4085,7 @@ poly16x4x4_t test_vld4_lane_p16(poly16_t *a, poly16x4x4_t b) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x4_t [[TMP19]]
poly64x1x4_t test_vld4_lane_p64(poly64_t *a, poly64x1x4_t b) {
@@ -5585,7 +4411,7 @@ void test_vst1_lane_p64(poly64_t *a, poly64x1_t b) {
// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -5605,7 +4431,7 @@ void test_vst2q_lane_u8(uint8_t *a, uint8x16x2_t b) {
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -5630,7 +4456,7 @@ void test_vst2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -5655,7 +4481,7 @@ void test_vst2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -5680,7 +4506,7 @@ void test_vst2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -5700,7 +4526,7 @@ void test_vst2q_lane_s8(int8_t *a, int8x16x2_t b) {
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -5725,7 +4551,7 @@ void test_vst2q_lane_s16(int16_t *a, int16x8x2_t b) {
// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -5750,7 +4576,7 @@ void test_vst2q_lane_s32(int32_t *a, int32x4x2_t b) {
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -5775,7 +4601,7 @@ void test_vst2q_lane_s64(int64_t *a, int64x2x2_t b) {
// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
@@ -5800,7 +4626,7 @@ void test_vst2q_lane_f16(float16_t *a, float16x8x2_t b) {
// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
@@ -5825,7 +4651,7 @@ void test_vst2q_lane_f32(float32_t *a, float32x4x2_t b) {
// CHECK: store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -5850,7 +4676,7 @@ void test_vst2q_lane_f64(float64_t *a, float64x2x2_t b) {
// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -5870,7 +4696,7 @@ void test_vst2q_lane_p8(poly8_t *a, poly8x16x2_t b) {
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -5895,7 +4721,7 @@ void test_vst2q_lane_p16(poly16_t *a, poly16x8x2_t b) {
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -5920,7 +4746,7 @@ void test_vst2q_lane_p64(poly64_t *a, poly64x2x2_t b) {
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -5940,7 +4766,7 @@ void test_vst2_lane_u8(uint8_t *a, uint8x8x2_t b) {
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -5965,7 +4791,7 @@ void test_vst2_lane_u16(uint16_t *a, uint16x4x2_t b) {
// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -5990,7 +4816,7 @@ void test_vst2_lane_u32(uint32_t *a, uint32x2x2_t b) {
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -6015,7 +4841,7 @@ void test_vst2_lane_u64(uint64_t *a, uint64x1x2_t b) {
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -6035,7 +4861,7 @@ void test_vst2_lane_s8(int8_t *a, int8x8x2_t b) {
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -6060,7 +4886,7 @@ void test_vst2_lane_s16(int16_t *a, int16x4x2_t b) {
// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -6085,7 +4911,7 @@ void test_vst2_lane_s32(int32_t *a, int32x2x2_t b) {
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -6110,7 +4936,7 @@ void test_vst2_lane_s64(int64_t *a, int64x1x2_t b) {
// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
@@ -6135,7 +4961,7 @@ void test_vst2_lane_f16(float16_t *a, float16x4x2_t b) {
// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
@@ -6160,7 +4986,7 @@ void test_vst2_lane_f32(float32_t *a, float32x2x2_t b) {
// CHECK: store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -6185,7 +5011,7 @@ void test_vst2_lane_f64(float64_t *a, float64x1x2_t b) {
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -6205,7 +5031,7 @@ void test_vst2_lane_p8(poly8_t *a, poly8x8x2_t b) {
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -6230,7 +5056,7 @@ void test_vst2_lane_p16(poly16_t *a, poly16x4x2_t b) {
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -6255,7 +5081,7 @@ void test_vst2_lane_p64(poly64_t *a, poly64x1x2_t b) {
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -6278,7 +5104,7 @@ void test_vst3q_lane_u8(uint8_t *a, uint8x16x3_t b) {
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -6308,7 +5134,7 @@ void test_vst3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -6338,7 +5164,7 @@ void test_vst3q_lane_u32(uint32_t *a, uint32x4x3_t b) {
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -6368,7 +5194,7 @@ void test_vst3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -6391,7 +5217,7 @@ void test_vst3q_lane_s8(int8_t *a, int8x16x3_t b) {
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -6421,7 +5247,7 @@ void test_vst3q_lane_s16(int16_t *a, int16x8x3_t b) {
// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -6451,7 +5277,7 @@ void test_vst3q_lane_s32(int32_t *a, int32x4x3_t b) {
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -6481,7 +5307,7 @@ void test_vst3q_lane_s64(int64_t *a, int64x2x3_t b) {
// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
@@ -6511,7 +5337,7 @@ void test_vst3q_lane_f16(float16_t *a, float16x8x3_t b) {
// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
@@ -6541,7 +5367,7 @@ void test_vst3q_lane_f32(float32_t *a, float32x4x3_t b) {
// CHECK: store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -6571,7 +5397,7 @@ void test_vst3q_lane_f64(float64_t *a, float64x2x3_t b) {
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -6594,7 +5420,7 @@ void test_vst3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -6624,7 +5450,7 @@ void test_vst3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -6654,7 +5480,7 @@ void test_vst3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -6677,7 +5503,7 @@ void test_vst3_lane_u8(uint8_t *a, uint8x8x3_t b) {
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -6707,7 +5533,7 @@ void test_vst3_lane_u16(uint16_t *a, uint16x4x3_t b) {
// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -6737,7 +5563,7 @@ void test_vst3_lane_u32(uint32_t *a, uint32x2x3_t b) {
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -6767,7 +5593,7 @@ void test_vst3_lane_u64(uint64_t *a, uint64x1x3_t b) {
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -6790,7 +5616,7 @@ void test_vst3_lane_s8(int8_t *a, int8x8x3_t b) {
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -6820,7 +5646,7 @@ void test_vst3_lane_s16(int16_t *a, int16x4x3_t b) {
// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -6850,7 +5676,7 @@ void test_vst3_lane_s32(int32_t *a, int32x2x3_t b) {
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -6880,7 +5706,7 @@ void test_vst3_lane_s64(int64_t *a, int64x1x3_t b) {
// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
@@ -6910,7 +5736,7 @@ void test_vst3_lane_f16(float16_t *a, float16x4x3_t b) {
// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
@@ -6940,7 +5766,7 @@ void test_vst3_lane_f32(float32_t *a, float32x2x3_t b) {
// CHECK: store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -6970,7 +5796,7 @@ void test_vst3_lane_f64(float64_t *a, float64x1x3_t b) {
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -6993,7 +5819,7 @@ void test_vst3_lane_p8(poly8_t *a, poly8x8x3_t b) {
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -7023,7 +5849,7 @@ void test_vst3_lane_p16(poly16_t *a, poly16x4x3_t b) {
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -7053,7 +5879,7 @@ void test_vst3_lane_p64(poly64_t *a, poly64x1x3_t b) {
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -7079,7 +5905,7 @@ void test_vst4q_lane_u8(uint8_t *a, uint8x16x4_t b) {
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -7114,7 +5940,7 @@ void test_vst4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -7149,7 +5975,7 @@ void test_vst4q_lane_u32(uint32_t *a, uint32x4x4_t b) {
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -7184,7 +6010,7 @@ void test_vst4q_lane_u64(uint64_t *a, uint64x2x4_t b) {
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -7210,7 +6036,7 @@ void test_vst4q_lane_s8(int8_t *a, int8x16x4_t b) {
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -7245,7 +6071,7 @@ void test_vst4q_lane_s16(int16_t *a, int16x8x4_t b) {
// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
@@ -7280,7 +6106,7 @@ void test_vst4q_lane_s32(int32_t *a, int32x4x4_t b) {
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -7315,7 +6141,7 @@ void test_vst4q_lane_s64(int64_t *a, int64x2x4_t b) {
// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
@@ -7350,7 +6176,7 @@ void test_vst4q_lane_f16(float16_t *a, float16x8x4_t b) {
// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
@@ -7385,7 +6211,7 @@ void test_vst4q_lane_f32(float32_t *a, float32x4x4_t b) {
// CHECK: store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
@@ -7420,7 +6246,7 @@ void test_vst4q_lane_f64(float64_t *a, float64x2x4_t b) {
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -7446,7 +6272,7 @@ void test_vst4q_lane_p8(poly8_t *a, poly8x16x4_t b) {
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
@@ -7481,7 +6307,7 @@ void test_vst4q_lane_p16(poly16_t *a, poly16x8x4_t b) {
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
@@ -7516,7 +6342,7 @@ void test_vst4q_lane_p64(poly64_t *a, poly64x2x4_t b) {
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -7542,7 +6368,7 @@ void test_vst4_lane_u8(uint8_t *a, uint8x8x4_t b) {
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -7577,7 +6403,7 @@ void test_vst4_lane_u16(uint16_t *a, uint16x4x4_t b) {
// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -7612,7 +6438,7 @@ void test_vst4_lane_u32(uint32_t *a, uint32x2x4_t b) {
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -7647,7 +6473,7 @@ void test_vst4_lane_u64(uint64_t *a, uint64x1x4_t b) {
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -7673,7 +6499,7 @@ void test_vst4_lane_s8(int8_t *a, int8x8x4_t b) {
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -7708,7 +6534,7 @@ void test_vst4_lane_s16(int16_t *a, int16x4x4_t b) {
// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
@@ -7743,7 +6569,7 @@ void test_vst4_lane_s32(int32_t *a, int32x2x4_t b) {
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
@@ -7778,7 +6604,7 @@ void test_vst4_lane_s64(int64_t *a, int64x1x4_t b) {
// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
@@ -7813,7 +6639,7 @@ void test_vst4_lane_f16(float16_t *a, float16x4x4_t b) {
// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
@@ -7848,7 +6674,7 @@ void test_vst4_lane_f32(float32_t *a, float32x2x4_t b) {
// CHECK: store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
@@ -7883,7 +6709,7 @@ void test_vst4_lane_f64(float64_t *a, float64x1x4_t b) {
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -7909,7 +6735,7 @@ void test_vst4_lane_p8(poly8_t *a, poly8x8x4_t b) {
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
@@ -7944,7 +6770,7 @@ void test_vst4_lane_p16(poly16_t *a, poly16x4x4_t b) {
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
diff --git a/test/CodeGen/aarch64-neon-misc.c b/test/CodeGen/aarch64-neon-misc.c
index 0772d4fc677d..f8ba7ee7121b 100644
--- a/test/CodeGen/aarch64-neon-misc.c
+++ b/test/CodeGen/aarch64-neon-misc.c
@@ -2253,22 +2253,6 @@ float64x2_t test_vcvt_high_f64_f32(float32x4_t a) {
return vcvt_high_f64_f32(a);
}
-// CHECK-LABEL: @test_vrndn_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRNDN1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDN1_I]]
-float32x2_t test_vrndn_f32(float32x2_t a) {
- return vrndn_f32(a);
-}
-
-// CHECK-LABEL: @test_vrndnq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRNDN1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDN1_I]]
-float32x4_t test_vrndnq_f32(float32x4_t a) {
- return vrndnq_f32(a);
-}
-
// CHECK-LABEL: @test_vrndnq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[VRNDN1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> %a)
@@ -2277,22 +2261,6 @@ float64x2_t test_vrndnq_f64(float64x2_t a) {
return vrndnq_f64(a);
}
-// CHECK-LABEL: @test_vrnda_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRNDA1_I:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDA1_I]]
-float32x2_t test_vrnda_f32(float32x2_t a) {
- return vrnda_f32(a);
-}
-
-// CHECK-LABEL: @test_vrndaq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRNDA1_I:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDA1_I]]
-float32x4_t test_vrndaq_f32(float32x4_t a) {
- return vrndaq_f32(a);
-}
-
// CHECK-LABEL: @test_vrndaq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> %a)
@@ -2301,22 +2269,6 @@ float64x2_t test_vrndaq_f64(float64x2_t a) {
return vrndaq_f64(a);
}
-// CHECK-LABEL: @test_vrndp_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRNDP1_I:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDP1_I]]
-float32x2_t test_vrndp_f32(float32x2_t a) {
- return vrndp_f32(a);
-}
-
-// CHECK-LABEL: @test_vrndpq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRNDP1_I:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDP1_I]]
-float32x4_t test_vrndpq_f32(float32x4_t a) {
- return vrndpq_f32(a);
-}
-
// CHECK-LABEL: @test_vrndpq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a)
@@ -2325,22 +2277,6 @@ float64x2_t test_vrndpq_f64(float64x2_t a) {
return vrndpq_f64(a);
}
-// CHECK-LABEL: @test_vrndm_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRNDM1_I:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDM1_I]]
-float32x2_t test_vrndm_f32(float32x2_t a) {
- return vrndm_f32(a);
-}
-
-// CHECK-LABEL: @test_vrndmq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRNDM1_I:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDM1_I]]
-float32x4_t test_vrndmq_f32(float32x4_t a) {
- return vrndmq_f32(a);
-}
-
// CHECK-LABEL: @test_vrndmq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[VRNDM1_I:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> %a)
@@ -2349,22 +2285,6 @@ float64x2_t test_vrndmq_f64(float64x2_t a) {
return vrndmq_f64(a);
}
-// CHECK-LABEL: @test_vrndx_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRNDX1_I:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDX1_I]]
-float32x2_t test_vrndx_f32(float32x2_t a) {
- return vrndx_f32(a);
-}
-
-// CHECK-LABEL: @test_vrndxq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRNDX1_I:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDX1_I]]
-float32x4_t test_vrndxq_f32(float32x4_t a) {
- return vrndxq_f32(a);
-}
-
// CHECK-LABEL: @test_vrndxq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[VRNDX1_I:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> %a)
@@ -2373,22 +2293,6 @@ float64x2_t test_vrndxq_f64(float64x2_t a) {
return vrndxq_f64(a);
}
-// CHECK-LABEL: @test_vrnd_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRNDZ1_I:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDZ1_I]]
-float32x2_t test_vrnd_f32(float32x2_t a) {
- return vrnd_f32(a);
-}
-
-// CHECK-LABEL: @test_vrndq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRNDZ1_I:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDZ1_I]]
-float32x4_t test_vrndq_f32(float32x4_t a) {
- return vrndq_f32(a);
-}
-
// CHECK-LABEL: @test_vrndq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a)
@@ -2397,22 +2301,6 @@ float64x2_t test_vrndq_f64(float64x2_t a) {
return vrndq_f64(a);
}
-// CHECK-LABEL: @test_vrndi_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDI1_I]]
-float32x2_t test_vrndi_f32(float32x2_t a) {
- return vrndi_f32(a);
-}
-
-// CHECK-LABEL: @test_vrndiq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDI1_I]]
-float32x4_t test_vrndiq_f32(float32x4_t a) {
- return vrndiq_f32(a);
-}
-
// CHECK-LABEL: @test_vrndiq_f64(
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
// CHECK: [[VRNDI1_I:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a)
diff --git a/test/CodeGen/aarch64-neon-perm.c b/test/CodeGen/aarch64-neon-perm.c
index 471017a99bbd..c5d5ab18070c 100644
--- a/test/CodeGen/aarch64-neon-perm.c
+++ b/test/CodeGen/aarch64-neon-perm.c
@@ -888,18 +888,14 @@ poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) {
// CHECK-LABEL: @test_vuzp_s8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
@@ -912,9 +908,8 @@ int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: @test_vuzp_s16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -923,9 +918,6 @@ int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
@@ -938,9 +930,8 @@ int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: @test_vuzp_s32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
@@ -949,9 +940,6 @@ int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
@@ -964,18 +952,14 @@ int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: @test_vuzp_u8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
@@ -988,9 +972,8 @@ uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: @test_vuzp_u16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -999,9 +982,6 @@ uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
@@ -1014,9 +994,8 @@ uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: @test_vuzp_u32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
@@ -1025,9 +1004,6 @@ uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
@@ -1040,9 +1016,8 @@ uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: @test_vuzp_f32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
@@ -1051,9 +1026,6 @@ uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
// CHECK: store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
@@ -1066,18 +1038,14 @@ float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: @test_vuzp_p8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
@@ -1090,9 +1058,8 @@ poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: @test_vuzp_p16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -1101,9 +1068,6 @@ poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
@@ -1116,18 +1080,14 @@ poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
// CHECK-LABEL: @test_vuzpq_s8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
@@ -1140,9 +1100,8 @@ int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: @test_vuzpq_s16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -1151,9 +1110,6 @@ int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
@@ -1166,9 +1122,8 @@ int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: @test_vuzpq_s32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
@@ -1177,9 +1132,6 @@ int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
@@ -1192,18 +1144,14 @@ int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: @test_vuzpq_u8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
@@ -1216,9 +1164,8 @@ uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: @test_vuzpq_u16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -1227,9 +1174,6 @@ uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
@@ -1242,9 +1186,8 @@ uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: @test_vuzpq_u32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
@@ -1253,9 +1196,6 @@ uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
@@ -1268,9 +1208,8 @@ uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: @test_vuzpq_f32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
@@ -1279,9 +1218,6 @@ uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
@@ -1294,18 +1230,14 @@ float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
// CHECK-LABEL: @test_vuzpq_p8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
@@ -1318,9 +1250,8 @@ poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
// CHECK-LABEL: @test_vuzpq_p16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -1329,9 +1260,6 @@ poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
@@ -1344,18 +1272,14 @@ poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
// CHECK-LABEL: @test_vzip_s8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
@@ -1368,9 +1292,8 @@ int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: @test_vzip_s16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -1379,9 +1302,6 @@ int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
@@ -1394,9 +1314,8 @@ int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: @test_vzip_s32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
@@ -1405,9 +1324,6 @@ int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
@@ -1420,18 +1336,14 @@ int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: @test_vzip_u8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
@@ -1444,9 +1356,8 @@ uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: @test_vzip_u16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -1455,9 +1366,6 @@ uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
@@ -1470,9 +1378,8 @@ uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: @test_vzip_u32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
@@ -1481,9 +1388,6 @@ uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
@@ -1496,9 +1400,8 @@ uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: @test_vzip_f32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
@@ -1507,9 +1410,6 @@ uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
// CHECK: store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
@@ -1522,18 +1422,14 @@ float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: @test_vzip_p8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
@@ -1546,9 +1442,8 @@ poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: @test_vzip_p16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -1557,9 +1452,6 @@ poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
@@ -1572,18 +1464,14 @@ poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
// CHECK-LABEL: @test_vzipq_s8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
@@ -1596,9 +1484,8 @@ int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: @test_vzipq_s16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -1607,9 +1494,6 @@ int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
@@ -1622,9 +1506,8 @@ int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: @test_vzipq_s32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
@@ -1633,9 +1516,6 @@ int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
@@ -1648,18 +1528,14 @@ int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: @test_vzipq_u8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
@@ -1672,9 +1548,8 @@ uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: @test_vzipq_u16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -1683,9 +1558,6 @@ uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
@@ -1698,9 +1570,8 @@ uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: @test_vzipq_u32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
@@ -1709,9 +1580,6 @@ uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
@@ -1724,9 +1592,8 @@ uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: @test_vzipq_f32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
@@ -1735,9 +1602,6 @@ uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
// CHECK: store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
@@ -1750,18 +1614,14 @@ float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
// CHECK-LABEL: @test_vzipq_p8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
@@ -1774,9 +1634,8 @@ poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
// CHECK-LABEL: @test_vzipq_p16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -1785,9 +1644,6 @@ poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
@@ -1800,18 +1656,14 @@ poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
// CHECK-LABEL: @test_vtrn_s8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
@@ -1824,9 +1676,8 @@ int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: @test_vtrn_s16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -1835,9 +1686,6 @@ int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
@@ -1850,9 +1698,8 @@ int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: @test_vtrn_s32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
@@ -1861,9 +1708,6 @@ int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
@@ -1876,18 +1720,14 @@ int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: @test_vtrn_u8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
@@ -1900,9 +1740,8 @@ uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: @test_vtrn_u16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -1911,9 +1750,6 @@ uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
@@ -1926,9 +1762,8 @@ uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: @test_vtrn_u32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
@@ -1937,9 +1772,6 @@ uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
@@ -1952,9 +1784,8 @@ uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: @test_vtrn_f32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
@@ -1963,9 +1794,6 @@ uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
// CHECK: store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
@@ -1978,18 +1806,14 @@ float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: @test_vtrn_p8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
@@ -2002,9 +1826,8 @@ poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: @test_vtrn_p16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
@@ -2013,9 +1836,6 @@ poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
@@ -2028,18 +1848,14 @@ poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
// CHECK-LABEL: @test_vtrnq_s8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
@@ -2052,9 +1868,8 @@ int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: @test_vtrnq_s16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -2063,9 +1878,6 @@ int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
@@ -2078,9 +1890,8 @@ int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: @test_vtrnq_s32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
@@ -2089,9 +1900,6 @@ int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
@@ -2104,18 +1912,14 @@ int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: @test_vtrnq_u8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
@@ -2128,9 +1932,8 @@ uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: @test_vtrnq_u16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -2139,9 +1942,6 @@ uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
@@ -2154,9 +1954,8 @@ uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: @test_vtrnq_u32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
@@ -2165,9 +1964,6 @@ uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
@@ -2180,9 +1976,8 @@ uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: @test_vtrnq_f32(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
@@ -2191,9 +1986,6 @@ uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK: store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
@@ -2206,18 +1998,14 @@ float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
// CHECK-LABEL: @test_vtrnq_p8(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
@@ -2230,9 +2018,8 @@ poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
// CHECK-LABEL: @test_vtrnq_p16(
// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
@@ -2241,9 +2028,6 @@ poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
diff --git a/test/CodeGen/aarch64-poly64.c b/test/CodeGen/aarch64-poly64.c
index 3fb8048fb39e..b70e5f0765e2 100644
--- a/test/CodeGen/aarch64-poly64.c
+++ b/test/CodeGen/aarch64-poly64.c
@@ -248,7 +248,7 @@ void test_vst1q_p64(poly64_t * ptr, poly64x2_t val) {
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
poly64x1x2_t test_vld2_p64(poly64_t const * ptr) {
@@ -266,7 +266,7 @@ poly64x1x2_t test_vld2_p64(poly64_t const * ptr) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
poly64x2x2_t test_vld2q_p64(poly64_t const * ptr) {
@@ -284,7 +284,7 @@ poly64x2x2_t test_vld2q_p64(poly64_t const * ptr) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x3_t [[TMP6]]
poly64x1x3_t test_vld3_p64(poly64_t const * ptr) {
@@ -302,7 +302,7 @@ poly64x1x3_t test_vld3_p64(poly64_t const * ptr) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x3_t [[TMP6]]
poly64x2x3_t test_vld3q_p64(poly64_t const * ptr) {
@@ -320,7 +320,7 @@ poly64x2x3_t test_vld3q_p64(poly64_t const * ptr) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x4_t [[TMP6]]
poly64x1x4_t test_vld4_p64(poly64_t const * ptr) {
@@ -338,7 +338,7 @@ poly64x1x4_t test_vld4_p64(poly64_t const * ptr) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x4_t [[TMP6]]
poly64x2x4_t test_vld4q_p64(poly64_t const * ptr) {
@@ -352,7 +352,7 @@ poly64x2x4_t test_vld4q_p64(poly64_t const * ptr) {
// CHECK: store [2 x <1 x i64>] [[VAL]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[VAL]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 0
@@ -377,7 +377,7 @@ void test_vst2_p64(poly64_t * ptr, poly64x1x2_t val) {
// CHECK: store [2 x <2 x i64>] [[VAL]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[VAL]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 0
@@ -402,7 +402,7 @@ void test_vst2q_p64(poly64_t * ptr, poly64x2x2_t val) {
// CHECK: store [3 x <1 x i64>] [[VAL]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[VAL]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 0
@@ -432,7 +432,7 @@ void test_vst3_p64(poly64_t * ptr, poly64x1x3_t val) {
// CHECK: store [3 x <2 x i64>] [[VAL]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[VAL]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 0
@@ -462,7 +462,7 @@ void test_vst3q_p64(poly64_t * ptr, poly64x2x3_t val) {
// CHECK: store [4 x <1 x i64>] [[VAL]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[VAL]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 0
@@ -497,7 +497,7 @@ void test_vst4_p64(poly64_t * ptr, poly64x1x4_t val) {
// CHECK: store [4 x <2 x i64>] [[VAL]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[VAL]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 0
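The aarch64-poly64.c hunks above only update the llvm.memcpy CHECK lines from the old five-operand form (with a trailing i32 alignment argument) to the newer form that carries align attributes on the pointer operands; the vld2/vst2-family intrinsics being tested are unchanged. Purely as a point of reference and not part of the patch, a minimal standalone sketch of the vld2_p64/vst2_p64 pair that test_vld2_p64 and test_vst2_p64 exercise could look like the C below, assuming an AArch64 toolchain whose arm_neon.h exposes the poly64 types (typically behind a crypto-capable -march setting); the function name copy_p64_pair is illustrative only.

#include <arm_neon.h>

/* Illustrative only: round-trip two poly64 elements through the
   structure-load/store intrinsics covered by the tests above. */
void copy_p64_pair(poly64_t *dst, const poly64_t *src) {
  /* vld2_p64 fills a poly64x1x2_t from two consecutive 64-bit elements. */
  poly64x1x2_t pair = vld2_p64(src);
  /* vst2_p64 writes both single-element vectors back out to memory. */
  vst2_p64(dst, pair);
}
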
diff --git a/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c b/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
new file mode 100644
index 000000000000..b8e1f92a2579
--- /dev/null
+++ b/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
@@ -0,0 +1,659 @@
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +fullfp16\
+// RUN: -fallow-half-arguments-and-returns -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_fp16.h>
+
+// CHECK-LABEL: test_vabsh_f16
+// CHECK: [[ABS:%.*]] = call half @llvm.fabs.f16(half %a)
+// CHECK: ret half [[ABS]]
+float16_t test_vabsh_f16(float16_t a) {
+ return vabsh_f16(a);
+}
+
+// CHECK-LABEL: test_vceqzh_f16
+// CHECK: [[TMP1:%.*]] = fcmp oeq half %a, 0xH0000
+// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK: ret i16 [[TMP2]]
+uint16_t test_vceqzh_f16(float16_t a) {
+ return vceqzh_f16(a);
+}
+
+// CHECK-LABEL: test_vcgezh_f16
+// CHECK: [[TMP1:%.*]] = fcmp oge half %a, 0xH0000
+// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK: ret i16 [[TMP2]]
+uint16_t test_vcgezh_f16(float16_t a) {
+ return vcgezh_f16(a);
+}
+
+// CHECK-LABEL: test_vcgtzh_f16
+// CHECK: [[TMP1:%.*]] = fcmp ogt half %a, 0xH0000
+// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK: ret i16 [[TMP2]]
+uint16_t test_vcgtzh_f16(float16_t a) {
+ return vcgtzh_f16(a);
+}
+
+// CHECK-LABEL: test_vclezh_f16
+// CHECK: [[TMP1:%.*]] = fcmp ole half %a, 0xH0000
+// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK: ret i16 [[TMP2]]
+uint16_t test_vclezh_f16(float16_t a) {
+ return vclezh_f16(a);
+}
+
+// CHECK-LABEL: test_vcltzh_f16
+// CHECK: [[TMP1:%.*]] = fcmp olt half %a, 0xH0000
+// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK: ret i16 [[TMP2]]
+uint16_t test_vcltzh_f16(float16_t a) {
+ return vcltzh_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_s16
+// CHECK: [[VCVT:%.*]] = sitofp i16 %a to half
+// CHECK: ret half [[VCVT]]
+float16_t test_vcvth_f16_s16 (int16_t a) {
+ return vcvth_f16_s16(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_s32
+// CHECK: [[VCVT:%.*]] = sitofp i32 %a to half
+// CHECK: ret half [[VCVT]]
+float16_t test_vcvth_f16_s32 (int32_t a) {
+ return vcvth_f16_s32(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_s64
+// CHECK: [[VCVT:%.*]] = sitofp i64 %a to half
+// CHECK: ret half [[VCVT]]
+float16_t test_vcvth_f16_s64 (int64_t a) {
+ return vcvth_f16_s64(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_u16
+// CHECK: [[VCVT:%.*]] = uitofp i16 %a to half
+// CHECK: ret half [[VCVT]]
+float16_t test_vcvth_f16_u16 (uint16_t a) {
+ return vcvth_f16_u16(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_u32
+// CHECK: [[VCVT:%.*]] = uitofp i32 %a to half
+// CHECK: ret half [[VCVT]]
+float16_t test_vcvth_f16_u32 (uint32_t a) {
+ return vcvth_f16_u32(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_u64
+// CHECK: [[VCVT:%.*]] = uitofp i64 %a to half
+// CHECK: ret half [[VCVT]]
+float16_t test_vcvth_f16_u64 (uint64_t a) {
+ return vcvth_f16_u64(a);
+}
+
+// CHECK-LABEL: test_vcvth_s16_f16
+// CHECK: [[VCVT:%.*]] = fptosi half %a to i16
+// CHECK: ret i16 [[VCVT]]
+int16_t test_vcvth_s16_f16 (float16_t a) {
+ return vcvth_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_s32_f16
+// CHECK: [[VCVT:%.*]] = fptosi half %a to i32
+// CHECK: ret i32 [[VCVT]]
+int32_t test_vcvth_s32_f16 (float16_t a) {
+ return vcvth_s32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_s64_f16
+// CHECK: [[VCVT:%.*]] = fptosi half %a to i64
+// CHECK: ret i64 [[VCVT]]
+int64_t test_vcvth_s64_f16 (float16_t a) {
+ return vcvth_s64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_u16_f16
+// CHECK: [[VCVT:%.*]] = fptoui half %a to i16
+// CHECK: ret i16 [[VCVT]]
+uint16_t test_vcvth_u16_f16 (float16_t a) {
+ return vcvth_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_u32_f16
+// CHECK: [[VCVT:%.*]] = fptoui half %a to i32
+// CHECK: ret i32 [[VCVT]]
+uint32_t test_vcvth_u32_f16 (float16_t a) {
+ return vcvth_u32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_u64_f16
+// CHECK: [[VCVT:%.*]] = fptoui half %a to i64
+// CHECK: ret i64 [[VCVT]]
+uint64_t test_vcvth_u64_f16 (float16_t a) {
+ return vcvth_u64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_s16_f16
+// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f16(half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16
+// CHECK: ret i16 [[RET]]
+int16_t test_vcvtah_s16_f16 (float16_t a) {
+ return vcvtah_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_s32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+int32_t test_vcvtah_s32_f16 (float16_t a) {
+ return vcvtah_s32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_s64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+int64_t test_vcvtah_s64_f16 (float16_t a) {
+ return vcvtah_s64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_u16_f16
+// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f16(half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16
+// CHECK: ret i16 [[RET]]
+uint16_t test_vcvtah_u16_f16 (float16_t a) {
+ return vcvtah_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_u32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+uint32_t test_vcvtah_u32_f16 (float16_t a) {
+ return vcvtah_u32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_u64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+uint64_t test_vcvtah_u64_f16 (float16_t a) {
+ return vcvtah_u64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_s16_f16
+// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f16(half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16
+// CHECK: ret i16 [[RET]]
+int16_t test_vcvtmh_s16_f16 (float16_t a) {
+ return vcvtmh_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_s32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+int32_t test_vcvtmh_s32_f16 (float16_t a) {
+ return vcvtmh_s32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_s64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+int64_t test_vcvtmh_s64_f16 (float16_t a) {
+ return vcvtmh_s64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_u16_f16
+// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f16(half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16
+// CHECK: ret i16 [[RET]]
+uint16_t test_vcvtmh_u16_f16 (float16_t a) {
+ return vcvtmh_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_u32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+uint32_t test_vcvtmh_u32_f16 (float16_t a) {
+ return vcvtmh_u32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmh_u64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+uint64_t test_vcvtmh_u64_f16 (float16_t a) {
+ return vcvtmh_u64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_s16_f16
+// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f16(half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16
+// CHECK: ret i16 [[RET]]
+int16_t test_vcvtnh_s16_f16 (float16_t a) {
+ return vcvtnh_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_s32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+int32_t test_vcvtnh_s32_f16 (float16_t a) {
+ return vcvtnh_s32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_s64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+int64_t test_vcvtnh_s64_f16 (float16_t a) {
+ return vcvtnh_s64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_u16_f16
+// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f16(half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16
+// CHECK: ret i16 [[RET]]
+uint16_t test_vcvtnh_u16_f16 (float16_t a) {
+ return vcvtnh_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_u32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+uint32_t test_vcvtnh_u32_f16 (float16_t a) {
+ return vcvtnh_u32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnh_u64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+uint64_t test_vcvtnh_u64_f16 (float16_t a) {
+ return vcvtnh_u64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_s16_f16
+// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f16(half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16
+// CHECK: ret i16 [[RET]]
+int16_t test_vcvtph_s16_f16 (float16_t a) {
+ return vcvtph_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_s32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+int32_t test_vcvtph_s32_f16 (float16_t a) {
+ return vcvtph_s32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_s64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+int64_t test_vcvtph_s64_f16 (float16_t a) {
+ return vcvtph_s64_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_u16_f16
+// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f16(half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16
+// CHECK: ret i16 [[RET]]
+uint16_t test_vcvtph_u16_f16 (float16_t a) {
+ return vcvtph_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_u32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f16(half %a)
+// CHECK: ret i32 [[VCVT]]
+uint32_t test_vcvtph_u32_f16 (float16_t a) {
+ return vcvtph_u32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtph_u64_f16
+// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f16(half %a)
+// CHECK: ret i64 [[VCVT]]
+uint64_t test_vcvtph_u64_f16 (float16_t a) {
+ return vcvtph_u64_f16(a);
+}
+
+// CHECK-LABEL: test_vnegh_f16
+// CHECK: [[NEG:%.*]] = fsub half 0xH8000, %a
+// CHECK: ret half [[NEG]]
+float16_t test_vnegh_f16(float16_t a) {
+ return vnegh_f16(a);
+}
+
+// CHECK-LABEL: test_vrecpeh_f16
+// CHECK: [[VREC:%.*]] = call half @llvm.aarch64.neon.frecpe.f16(half %a)
+// CHECK: ret half [[VREC]]
+float16_t test_vrecpeh_f16(float16_t a) {
+ return vrecpeh_f16(a);
+}
+
+// CHECK-LABEL: test_vrecpxh_f16
+// CHECK: [[VREC:%.*]] = call half @llvm.aarch64.neon.frecpx.f16(half %a)
+// CHECK: ret half [[VREC]]
+float16_t test_vrecpxh_f16(float16_t a) {
+ return vrecpxh_f16(a);
+}
+
+// CHECK-LABEL: test_vrndh_f16
+// CHECK: [[RND:%.*]] = call half @llvm.trunc.f16(half %a)
+// CHECK: ret half [[RND]]
+float16_t test_vrndh_f16(float16_t a) {
+ return vrndh_f16(a);
+}
+
+// CHECK-LABEL: test_vrndah_f16
+// CHECK: [[RND:%.*]] = call half @llvm.round.f16(half %a)
+// CHECK: ret half [[RND]]
+float16_t test_vrndah_f16(float16_t a) {
+ return vrndah_f16(a);
+}
+
+// CHECK-LABEL: test_vrndih_f16
+// CHECK: [[RND:%.*]] = call half @llvm.nearbyint.f16(half %a)
+// CHECK: ret half [[RND]]
+float16_t test_vrndih_f16(float16_t a) {
+ return vrndih_f16(a);
+}
+
+// CHECK-LABEL: test_vrndmh_f16
+// CHECK: [[RND:%.*]] = call half @llvm.floor.f16(half %a)
+// CHECK: ret half [[RND]]
+float16_t test_vrndmh_f16(float16_t a) {
+ return vrndmh_f16(a);
+}
+
+// CHECK-LABEL: test_vrndnh_f16
+// CHECK: [[RND:%.*]] = call half @llvm.aarch64.neon.frintn.f16(half %a)
+// CHECK: ret half [[RND]]
+float16_t test_vrndnh_f16(float16_t a) {
+ return vrndnh_f16(a);
+}
+
+// CHECK-LABEL: test_vrndph_f16
+// CHECK: [[RND:%.*]] = call half @llvm.ceil.f16(half %a)
+// CHECK: ret half [[RND]]
+float16_t test_vrndph_f16(float16_t a) {
+ return vrndph_f16(a);
+}
+
+// CHECK-LABEL: test_vrndxh_f16
+// CHECK: [[RND:%.*]] = call half @llvm.rint.f16(half %a)
+// CHECK: ret half [[RND]]
+float16_t test_vrndxh_f16(float16_t a) {
+ return vrndxh_f16(a);
+}
+
+// CHECK-LABEL: test_vrsqrteh_f16
+// CHECK: [[RND:%.*]] = call half @llvm.aarch64.neon.frsqrte.f16(half %a)
+// CHECK: ret half [[RND]]
+float16_t test_vrsqrteh_f16(float16_t a) {
+ return vrsqrteh_f16(a);
+}
+
+// CHECK-LABEL: test_vsqrth_f16
+// CHECK: [[SQR:%.*]] = call half @llvm.sqrt.f16(half %a)
+// CHECK: ret half [[SQR]]
+float16_t test_vsqrth_f16(float16_t a) {
+ return vsqrth_f16(a);
+}
+
+// CHECK-LABEL: test_vaddh_f16
+// CHECK: [[ADD:%.*]] = fadd half %a, %b
+// CHECK: ret half [[ADD]]
+float16_t test_vaddh_f16(float16_t a, float16_t b) {
+ return vaddh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vabdh_f16
+// CHECK: [[ABD:%.*]] = call half @llvm.aarch64.sisd.fabd.f16(half %a, half %b)
+// CHECK: ret half [[ABD]]
+float16_t test_vabdh_f16(float16_t a, float16_t b) {
+ return vabdh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcageh_f16
+// CHECK: [[FACG:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f16(half %a, half %b)
+// CHECK: [[RET:%.*]] = trunc i32 [[FACG]] to i16
+// CHECK: ret i16 [[RET]]
+uint16_t test_vcageh_f16(float16_t a, float16_t b) {
+ return vcageh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcagth_f16
+// CHECK: [[FACG:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f16(half %a, half %b)
+// CHECK: [[RET:%.*]] = trunc i32 [[FACG]] to i16
+// CHECK: ret i16 [[RET]]
+uint16_t test_vcagth_f16(float16_t a, float16_t b) {
+ return vcagth_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcaleh_f16
+// CHECK: [[FACG:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f16(half %b, half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FACG]] to i16
+// CHECK: ret i16 [[RET]]
+uint16_t test_vcaleh_f16(float16_t a, float16_t b) {
+ return vcaleh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcalth_f16
+// CHECK: [[FACG:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f16(half %b, half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FACG]] to i16
+// CHECK: ret i16 [[RET]]
+uint16_t test_vcalth_f16(float16_t a, float16_t b) {
+ return vcalth_f16(a, b);
+}
+
+// CHECK-LABEL: test_vceqh_f16
+// CHECK: [[TMP1:%.*]] = fcmp oeq half %a, %b
+// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK: ret i16 [[TMP2]]
+uint16_t test_vceqh_f16(float16_t a, float16_t b) {
+ return vceqh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcgeh_f16
+// CHECK: [[TMP1:%.*]] = fcmp oge half %a, %b
+// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK: ret i16 [[TMP2]]
+uint16_t test_vcgeh_f16(float16_t a, float16_t b) {
+ return vcgeh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcgth_f16
+// CHECK: [[TMP1:%.*]] = fcmp ogt half %a, %b
+// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK: ret i16 [[TMP2]]
+uint16_t test_vcgth_f16(float16_t a, float16_t b) {
+ return vcgth_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcleh_f16
+// CHECK: [[TMP1:%.*]] = fcmp ole half %a, %b
+// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK: ret i16 [[TMP2]]
+uint16_t test_vcleh_f16(float16_t a, float16_t b) {
+ return vcleh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vclth_f16
+// CHECK: [[TMP1:%.*]] = fcmp olt half %a, %b
+// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK: ret i16 [[TMP2]]
+uint16_t test_vclth_f16(float16_t a, float16_t b) {
+ return vclth_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_s16
+// CHECK: [[SEXT:%.*]] = sext i16 %a to i32
+// CHECK: [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i32(i32 [[SEXT]], i32 1)
+// CHECK: ret half [[CVT]]
+float16_t test_vcvth_n_f16_s16(int16_t a) {
+ return vcvth_n_f16_s16(a, 1);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_s32
+// CHECK: [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i32(i32 %a, i32 1)
+// CHECK: ret half [[CVT]]
+float16_t test_vcvth_n_f16_s32(int32_t a) {
+ return vcvth_n_f16_s32(a, 1);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_s64
+// CHECK: [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i64(i64 %a, i32 1)
+// CHECK: ret half [[CVT]]
+float16_t test_vcvth_n_f16_s64(int64_t a) {
+ return vcvth_n_f16_s64(a, 1);
+}
+
+// CHECK-LABEL: test_vcvth_n_s16_f16
+// CHECK: [[CVT:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f16(half %a, i32 1)
+// CHECK: [[RET:%.*]] = trunc i32 [[CVT]] to i16
+// CHECK: ret i16 [[RET]]
+int16_t test_vcvth_n_s16_f16(float16_t a) {
+ return vcvth_n_s16_f16(a, 1);
+}
+
+// CHECK-LABEL: test_vcvth_n_s32_f16
+// CHECK: [[CVT:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f16(half %a, i32 1)
+// CHECK: ret i32 [[CVT]]
+int32_t test_vcvth_n_s32_f16(float16_t a) {
+ return vcvth_n_s32_f16(a, 1);
+}
+
+// CHECK-LABEL: test_vcvth_n_s64_f16
+// CHECK: [[CVT:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f16(half %a, i32 1)
+// CHECK: ret i64 [[CVT]]
+int64_t test_vcvth_n_s64_f16(float16_t a) {
+ return vcvth_n_s64_f16(a, 1);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_u16
+// CHECK: [[SEXT:%.*]] = zext i16 %a to i32
+// CHECK: [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32 [[SEXT]], i32 1)
+// CHECK: ret half [[CVT]]
+float16_t test_vcvth_n_f16_u16(uint16_t a) {
+ return vcvth_n_f16_u16(a, 1);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_u32
+// CHECK: [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32 %a, i32 1)
+// CHECK: ret half [[CVT]]
+float16_t test_vcvth_n_f16_u32(uint32_t a) {
+ return vcvth_n_f16_u32(a, 1);
+}
+
+// CHECK-LABEL: test_vcvth_n_f16_u64
+// CHECK: [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i64(i64 %a, i32 1)
+// CHECK: ret half [[CVT]]
+float16_t test_vcvth_n_f16_u64(uint64_t a) {
+ return vcvth_n_f16_u64(a, 1);
+}
+
+// CHECK-LABEL: test_vcvth_n_u16_f16
+// CHECK: [[CVT:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f16(half %a, i32 1)
+// CHECK: [[RET:%.*]] = trunc i32 [[CVT]] to i16
+// CHECK: ret i16 [[RET]]
+uint16_t test_vcvth_n_u16_f16(float16_t a) {
+ return vcvth_n_u16_f16(a, 1);
+}
+
+// CHECK-LABEL: test_vcvth_n_u32_f16
+// CHECK: [[CVT:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f16(half %a, i32 1)
+// CHECK: ret i32 [[CVT]]
+uint32_t test_vcvth_n_u32_f16(float16_t a) {
+ return vcvth_n_u32_f16(a, 1);
+}
+
+// CHECK-LABEL: test_vcvth_n_u64_f16
+// CHECK: [[CVT:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxu.i64.f16(half %a, i32 1)
+// CHECK: ret i64 [[CVT]]
+uint64_t test_vcvth_n_u64_f16(float16_t a) {
+ return vcvth_n_u64_f16(a, 1);
+}
+
+// CHECK-LABEL: test_vdivh_f16
+// CHECK: [[DIV:%.*]] = fdiv half %a, %b
+// CHECK: ret half [[DIV]]
+float16_t test_vdivh_f16(float16_t a, float16_t b) {
+ return vdivh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmaxh_f16
+// CHECK: [[MAX:%.*]] = call half @llvm.aarch64.neon.fmax.f16(half %a, half %b)
+// CHECK: ret half [[MAX]]
+float16_t test_vmaxh_f16(float16_t a, float16_t b) {
+ return vmaxh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmaxnmh_f16
+// CHECK: [[MAX:%.*]] = call half @llvm.aarch64.neon.fmaxnm.f16(half %a, half %b)
+// CHECK: ret half [[MAX]]
+float16_t test_vmaxnmh_f16(float16_t a, float16_t b) {
+ return vmaxnmh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vminh_f16
+// CHECK: [[MIN:%.*]] = call half @llvm.aarch64.neon.fmin.f16(half %a, half %b)
+// CHECK: ret half [[MIN]]
+float16_t test_vminh_f16(float16_t a, float16_t b) {
+ return vminh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vminnmh_f16
+// CHECK: [[MIN:%.*]] = call half @llvm.aarch64.neon.fminnm.f16(half %a, half %b)
+// CHECK: ret half [[MIN]]
+float16_t test_vminnmh_f16(float16_t a, float16_t b) {
+ return vminnmh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmulh_f16
+// CHECK: [[MUL:%.*]] = fmul half %a, %b
+// CHECK: ret half [[MUL]]
+float16_t test_vmulh_f16(float16_t a, float16_t b) {
+ return vmulh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmulxh_f16
+// CHECK: [[MUL:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half %b)
+// CHECK: ret half [[MUL]]
+float16_t test_vmulxh_f16(float16_t a, float16_t b) {
+ return vmulxh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vrecpsh_f16
+// CHECK: [[RECPS:%.*]] = call half @llvm.aarch64.neon.frecps.f16(half %a, half %b)
+// CHECK: ret half [[RECPS]]
+float16_t test_vrecpsh_f16(float16_t a, float16_t b) {
+ return vrecpsh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vrsqrtsh_f16
+// CHECK: [[RSQRTS:%.*]] = call half @llvm.aarch64.neon.frsqrts.f16(half %a, half %b)
+// CHECK: ret half [[RSQRTS]]
+float16_t test_vrsqrtsh_f16(float16_t a, float16_t b) {
+ return vrsqrtsh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vsubh_f16
+// CHECK: [[SUB:%.*]] = fsub half %a, %b
+// CHECK: ret half [[SUB]]
+float16_t test_vsubh_f16(float16_t a, float16_t b) {
+ return vsubh_f16(a, b);
+}
+
+// CHECK-LABEL: test_vfmah_f16
+// CHECK: [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half %c, half %a)
+// CHECK: ret half [[FMA]]
+float16_t test_vfmah_f16(float16_t a, float16_t b, float16_t c) {
+ return vfmah_f16(a, b, c);
+}
+
+// CHECK-LABEL: test_vfmsh_f16
+// CHECK: [[SUB:%.*]] = fsub half 0xH8000, %b
+// CHECK: [[ADD:%.*]] = call half @llvm.fma.f16(half [[SUB]], half %c, half %a)
+// CHECK: ret half [[ADD]]
+float16_t test_vfmsh_f16(float16_t a, float16_t b, float16_t c) {
+ return vfmsh_f16(a, b, c);
+}
+
diff --git a/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c b/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
index 3f61238b64fb..e1a2e3fb92dd 100644
--- a/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ b/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -164,6 +164,13 @@ int16x4_t test_vcvta_s16_f16 (float16x4_t a) {
return vcvta_s16_f16(a);
}
+// CHECK-LABEL: test_vcvta_u16_f16
+// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half> %a)
+// CHECK: ret <4 x i16> [[VCVT]]
+uint16x4_t test_vcvta_u16_f16 (float16x4_t a) {
+ return vcvta_u16_f16(a);
+}
+
// CHECK-LABEL: test_vcvtaq_s16_f16
// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtas.v8i16.v8f16(<8 x half> %a)
// CHECK: ret <8 x i16> [[VCVT]]
@@ -1223,27 +1230,25 @@ float16x8_t test_vmulxq_n_f16(float16x8_t a, float16_t b) {
return vmulxq_n_f16(a, b);
}
-/* TODO: Not implemented yet (needs scalar intrinsic from arm_fp16.h)
-// CCHECK-LABEL: test_vmulxh_lane_f16
-// CCHECK: [[CONV0:%.*]] = fpext half %a to float
-// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CCHECK: [[MUL:%.*]] = fmul float [[CONV0:%.*]], [[CONV0:%.*]]
-// CCHECK: [[CONV3:%.*]] = fptrunc float %mul to half
-// CCHECK: ret half [[CONV3:%.*]]
+// CHECK-LABEL: test_vmulxh_lane_f16
+// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK: [[EXTR:%.*]] = extractelement <4 x half> [[TMP1]], i32 3
+// CHECK: [[MULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half [[EXTR]])
+// CHECK: ret half [[MULX]]
float16_t test_vmulxh_lane_f16(float16_t a, float16x4_t b) {
return vmulxh_lane_f16(a, b, 3);
}
-// CCHECK-LABEL: test_vmulxh_laneq_f16
-// CCHECK: [[CONV0:%.*]] = fpext half %a to float
-// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CCHECK: [[MUL:%.*]] = fmul float [[CONV0:%.*]], [[CONV0:%.*]]
-// CCHECK: [[CONV3:%.*]] = fptrunc float %mul to half
-// CCHECK: ret half [[CONV3:%.*]]
+// CHECK-LABEL: test_vmulxh_laneq_f16
+// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK: [[EXTR:%.*]] = extractelement <8 x half> [[TMP1]], i32 7
+// CHECK: [[MULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half [[EXTR]])
+// CHECK: ret half [[MULX]]
float16_t test_vmulxh_laneq_f16(float16_t a, float16x8_t b) {
return vmulxh_laneq_f16(a, b, 7);
}
-*/
// CHECK-LABEL: test_vmaxv_f16
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
@@ -1350,16 +1355,13 @@ float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-LABEL: test_vzip_f16
// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x half>*
// CHECK: [[VZIP0_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// CHECK: store <4 x half> [[VZIP0_I]], <4 x half>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, <4 x half>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
// CHECK: store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false)
float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
return vzip_f16(a, b);
}
@@ -1367,16 +1369,13 @@ float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: test_vzipq_f16
// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x half>*
// CHECK: [[VZIP0_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// CHECK: store <8 x half> [[VZIP0_I]], <8 x half>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, <8 x half>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK: store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false)
float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
return vzipq_f16(a, b);
}
@@ -1384,16 +1383,13 @@ float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: test_vuzp_f16
// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x half>*
// CHECK: [[VZIP0_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
// CHECK: store <4 x half> [[VZIP0_I]], <4 x half>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, <4 x half>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
// CHECK: store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false)
float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
return vuzp_f16(a, b);
}
@@ -1401,16 +1397,13 @@ float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: test_vuzpq_f16
// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x half>*
// CHECK: [[VZIP0_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
// CHECK: store <8 x half> [[VZIP0_I]], <8 x half>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, <8 x half>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK: store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false)
float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
return vuzpq_f16(a, b);
}
@@ -1418,16 +1411,13 @@ float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: test_vtrn_f16
// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[__RET_I:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x half>*
// CHECK: [[VZIP0_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
// CHECK: store <4 x half> [[VZIP0_I]], <4 x half>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, <4 x half>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
// CHECK: store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false)
float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
return vtrn_f16(a, b);
}
@@ -1435,16 +1425,13 @@ float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: test_vtrnq_f16
// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK: [[__RET_I:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x half>*
// CHECK: [[VZIP0_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
// CHECK: store <8 x half> [[VZIP0_I]], <8 x half>* [[TMP1]]
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, <8 x half>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK: store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false)
float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
return vtrnq_f16(a, b);
}
diff --git a/test/CodeGen/aarch64-varargs-ms.c b/test/CodeGen/aarch64-varargs-ms.c
index f3ff9603c9c2..c27e78570baa 100644
--- a/test/CodeGen/aarch64-varargs-ms.c
+++ b/test/CodeGen/aarch64-varargs-ms.c
@@ -3,7 +3,7 @@
#include <stdarg.h>
int simple_int(va_list ap) {
-// CHECK-LABEL: define i32 @simple_int
+// CHECK-LABEL: define dso_local i32 @simple_int
return va_arg(ap, int);
// CHECK: [[ADDR:%[a-z_0-9]+]] = bitcast i8* %argp.cur to i32*
// CHECK: [[RESULT:%[a-z_0-9]+]] = load i32, i32* [[ADDR]]
diff --git a/test/CodeGen/address-safety-attr-flavors.cpp b/test/CodeGen/address-safety-attr-flavors.cpp
new file mode 100644
index 000000000000..815cbae78a67
--- /dev/null
+++ b/test/CodeGen/address-safety-attr-flavors.cpp
@@ -0,0 +1,75 @@
+// Make sure the sanitize_address attribute is emitted when using ASan, KASan or
+// HWASan. Either __attribute__((no_sanitize("address"))) or
+// __attribute__((no_sanitize("kernel-address"))) disables both ASan and KASan
+// instrumentation.
+// Likewise, __attribute__((no_sanitize("hwaddress"))) or
+// __attribute__((no_sanitize("kernel-hwaddress"))) disables both HWASan and KHWASan.
+
+// RUN: %clang_cc1 -triple i386-unknown-linux -disable-O0-optnone \
+// RUN: -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-NOASAN %s
+
+// RUN: %clang_cc1 -triple i386-unknown-linux -fsanitize=address \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | \
+// RUN: FileCheck -check-prefix=CHECK-ASAN %s
+
+// RUN: %clang_cc1 -triple i386-unknown-linux -fsanitize=kernel-address \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | \
+// RUN: FileCheck -check-prefix=CHECK-KASAN %s
+
+// RUN: %clang_cc1 -triple i386-unknown-linux -fsanitize=hwaddress \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | \
+// RUN: FileCheck -check-prefix=CHECK-HWASAN %s
+
+// RUN: %clang_cc1 -triple i386-unknown-linux -fsanitize=kernel-hwaddress \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | \
+// RUN: FileCheck -check-prefix=CHECK-KHWASAN %s
+
+int HasSanitizeAddress() { return 1; }
+// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-ASAN: Function Attrs: noinline nounwind sanitize_address
+// CHECK-KASAN: Function Attrs: noinline nounwind sanitize_address
+// CHECK-HWASAN: Function Attrs: noinline nounwind sanitize_hwaddress
+// CHECK-KHWASAN: Function Attrs: noinline nounwind sanitize_hwaddress
+
+__attribute__((no_sanitize("address"))) int NoSanitizeQuoteAddress() {
+ return 0;
+}
+// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-ASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-KASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-HWASAN: {{Function Attrs: noinline nounwind sanitize_hwaddress$}}
+// CHECK-KHWASAN: {{Function Attrs: noinline nounwind sanitize_hwaddress$}}
+
+__attribute__((no_sanitize_address)) int NoSanitizeAddress() { return 0; }
+// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-ASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-KASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-HWASAN: {{Function Attrs: noinline nounwind sanitize_hwaddress$}}
+// CHECK-KHWASAN: {{Function Attrs: noinline nounwind sanitize_hwaddress$}}
+
+__attribute__((no_sanitize("kernel-address"))) int NoSanitizeKernelAddress() {
+ return 0;
+}
+// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-ASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-KASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-HWASAN: {{Function Attrs: noinline nounwind sanitize_hwaddress$}}
+// CHECK-KHWASAN: {{Function Attrs: noinline nounwind sanitize_hwaddress$}}
+
+__attribute__((no_sanitize("hwaddress"))) int NoSanitizeHWAddress() {
+ return 0;
+}
+// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-ASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
+// CHECK-KASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
+// CHECK-HWASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-KHWASAN: {{Function Attrs: noinline nounwind$}}
+
+__attribute__((no_sanitize("kernel-hwaddress"))) int NoSanitizeKernelHWAddress() {
+ return 0;
+}
+// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-ASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
+// CHECK-KASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
+// CHECK-HWASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-KHWASAN: {{Function Attrs: noinline nounwind$}}
diff --git a/test/CodeGen/address-safety-attr-kasan-hwasan.cpp b/test/CodeGen/address-safety-attr-kasan-hwasan.cpp
deleted file mode 100644
index 7a84b798e4b9..000000000000
--- a/test/CodeGen/address-safety-attr-kasan-hwasan.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-// Make sure the sanitize_address attribute is emitted when using both ASan and KASan.
-// Also document that __attribute__((no_sanitize_address)) doesn't disable KASan instrumentation.
-
-/// RUN: %clang_cc1 -triple i386-unknown-linux -disable-O0-optnone -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-NOASAN %s
-/// RUN: %clang_cc1 -triple i386-unknown-linux -fsanitize=address -disable-O0-optnone -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-ASAN %s
-/// RUN: %clang_cc1 -triple i386-unknown-linux -fsanitize=kernel-address -disable-O0-optnone -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-KASAN %s
-/// RUN: %clang_cc1 -triple i386-unknown-linux -fsanitize=hwaddress -disable-O0-optnone -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-HWASAN %s
-
-int HasSanitizeAddress() {
- return 1;
-}
-// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
-// CHECK-ASAN: Function Attrs: noinline nounwind sanitize_address
-// CHECK-KASAN: Function Attrs: noinline nounwind sanitize_address
-// CHECK-HWASAN: Function Attrs: noinline nounwind sanitize_hwaddress
-
-__attribute__((no_sanitize("address")))
-int NoSanitizeQuoteAddress() {
- return 0;
-}
-// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
-// CHECK-ASAN: {{Function Attrs: noinline nounwind$}}
-// CHECK-KASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
-// CHECK-HWASAN: {{Function Attrs: noinline nounwind sanitize_hwaddress$}}
-
-__attribute__((no_sanitize_address))
-int NoSanitizeAddress() {
- return 0;
-}
-// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
-// CHECK-ASAN: {{Function Attrs: noinline nounwind$}}
-// CHECK-KASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
-// CHECK-HWASAN: {{Function Attrs: noinline nounwind sanitize_hwaddress$}}
-
-__attribute__((no_sanitize("kernel-address")))
-int NoSanitizeKernelAddress() {
- return 0;
-}
-
-// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
-// CHECK-ASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
-// CHECK-KASAN: {{Function Attrs: noinline nounwind$}}
-// CHECK-HWASAN: {{Function Attrs: noinline nounwind sanitize_hwaddress$}}
-
-__attribute__((no_sanitize("hwaddress")))
-int NoSanitizeHWAddress() {
- return 0;
-}
-
-// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
-// CHECK-ASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
-// CHECK-KASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
-// CHECK-HWASAN: {{Function Attrs: noinline nounwind$}}
diff --git a/test/CodeGen/address-safety-attr-kasan.cpp b/test/CodeGen/address-safety-attr-kasan.cpp
deleted file mode 100644
index 603134db69fb..000000000000
--- a/test/CodeGen/address-safety-attr-kasan.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-// Make sure the sanitize_address attribute is emitted when using both ASan and KASan.
-// Also document that __attribute__((no_sanitize_address)) doesn't disable KASan instrumentation.
-
-/// RUN: %clang_cc1 -triple i386-unknown-linux -disable-O0-optnone -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-NOASAN %s
-/// RUN: %clang_cc1 -triple i386-unknown-linux -fsanitize=address -disable-O0-optnone -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-ASAN %s
-/// RUN: %clang_cc1 -triple i386-unknown-linux -fsanitize=kernel-address -disable-O0-optnone -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-KASAN %s
-
-int HasSanitizeAddress() {
- return 1;
-}
-// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
-// CHECK-ASAN: Function Attrs: noinline nounwind sanitize_address
-// CHECK-KASAN: Function Attrs: noinline nounwind sanitize_address
-
-__attribute__((no_sanitize("address")))
-int NoSanitizeQuoteAddress() {
- return 0;
-}
-// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
-// CHECK-ASAN: {{Function Attrs: noinline nounwind$}}
-// CHECK-KASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
-
-__attribute__((no_sanitize_address))
-int NoSanitizeAddress() {
- return 0;
-}
-// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
-// CHECK-ASAN: {{Function Attrs: noinline nounwind$}}
-// CHECK-KASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
-
-__attribute__((no_sanitize("kernel-address")))
-int NoSanitizeKernelAddress() {
- return 0;
-}
-
-// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
-// CHECK-ASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
-// CHECK-KASAN: {{Function Attrs: noinline nounwind$}}
diff --git a/test/CodeGen/address-sanitizer-and-array-cookie.cpp b/test/CodeGen/address-sanitizer-and-array-cookie.cpp
index ea8953778916..e2267a10c2be 100644
--- a/test/CodeGen/address-sanitizer-and-array-cookie.cpp
+++ b/test/CodeGen/address-sanitizer-and-array-cookie.cpp
@@ -1,5 +1,6 @@
// RUN: %clang_cc1 -triple x86_64-gnu-linux -emit-llvm -o - %s | FileCheck %s -check-prefix=PLAIN
// RUN: %clang_cc1 -triple x86_64-gnu-linux -emit-llvm -o - -fsanitize=address %s | FileCheck %s -check-prefix=ASAN
+// RUN: %clang_cc1 -triple x86_64-gnu-linux -emit-llvm -o - -fsanitize=address -fsanitize-address-poison-class-member-array-new-cookie %s | FileCheck %s -check-prefix=ASAN-POISON-ALL-NEW-ARRAY
typedef __typeof__(sizeof(0)) size_t;
namespace std {
@@ -8,6 +9,7 @@ namespace std {
}
void *operator new[](size_t, const std::nothrow_t &) throw();
void *operator new[](size_t, char *);
+void *operator new[](size_t, int, int);
struct C {
int x;
@@ -53,3 +55,10 @@ C *CallPlacementNew() {
}
// ASAN-LABEL: CallPlacementNew
// ASAN-NOT: __asan_poison_cxx_array_cookie
+
+C *CallNewWithArgs() {
+// ASAN-LABEL: CallNewWithArgs
+// ASAN-NOT: call void @__asan_poison_cxx_array_cookie
+// ASAN-POISON-ALL-NEW-ARRAY: call void @__asan_poison_cxx_array_cookie
+ return new (123, 456) C[20];
+}
diff --git a/test/CodeGen/address-space.c b/test/CodeGen/address-space.c
index 28b3954ab7d2..d6a40f33029f 100644
--- a/test/CodeGen/address-space.c
+++ b/test/CodeGen/address-space.c
@@ -1,6 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm < %s | FileCheck -check-prefixes=CHECK,X86,GIZ %s
-// RUN: %clang_cc1 -triple amdgcn -emit-llvm < %s | FileCheck -check-prefixes=CHECK,PIZ %s
-// RUN: %clang_cc1 -triple amdgcn---amdgiz -emit-llvm < %s | FileCheck -check-prefixes=CHECK,AMDGIZ,GIZ %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm < %s | FileCheck -check-prefixes=CHECK,X86 %s
+// RUN: %clang_cc1 -triple amdgcn -emit-llvm < %s | FileCheck -check-prefixes=CHECK,AMDGCN %s
// CHECK: @foo = common addrspace(1) global
int foo __attribute__((address_space(1)));
@@ -25,12 +24,10 @@ __attribute__((address_space(2))) int *A, *B;
// CHECK-LABEL: define void @test3()
// X86: load i32 addrspace(2)*, i32 addrspace(2)** @B
-// AMDGIZ: load i32 addrspace(2)*, i32 addrspace(2)** addrspacecast (i32 addrspace(2)* addrspace(1)* @B to i32 addrspace(2)**)
-// PIZ: load i32 addrspace(2)*, i32 addrspace(2)* addrspace(4)* addrspacecast (i32 addrspace(2)* addrspace(1)* @B to i32 addrspace(2)* addrspace(4)*)
+// AMDGCN: load i32 addrspace(2)*, i32 addrspace(2)** addrspacecast (i32 addrspace(2)* addrspace(1)* @B to i32 addrspace(2)**)
// CHECK: load i32, i32 addrspace(2)*
// X86: load i32 addrspace(2)*, i32 addrspace(2)** @A
-// AMDGIZ: load i32 addrspace(2)*, i32 addrspace(2)** addrspacecast (i32 addrspace(2)* addrspace(1)* @A to i32 addrspace(2)**)
-// PIZ: load i32 addrspace(2)*, i32 addrspace(2)* addrspace(4)* addrspacecast (i32 addrspace(2)* addrspace(1)* @A to i32 addrspace(2)* addrspace(4)*)
+// AMDGCN: load i32 addrspace(2)*, i32 addrspace(2)** addrspacecast (i32 addrspace(2)* addrspace(1)* @A to i32 addrspace(2)**)
// CHECK: store i32 {{.*}}, i32 addrspace(2)*
void test3() {
*A = *B;
@@ -42,10 +39,8 @@ typedef struct {
} MyStruct;
// CHECK-LABEL: define void @test4(
-// GIZ: call void @llvm.memcpy.p0i8.p2i8
-// GIZ: call void @llvm.memcpy.p2i8.p0i8
-// PIZ: call void @llvm.memcpy.p4i8.p2i8
-// PIZ: call void @llvm.memcpy.p2i8.p4i8
+// CHECK: call void @llvm.memcpy.p0i8.p2i8
+// CHECK: call void @llvm.memcpy.p2i8.p0i8
void test4(MyStruct __attribute__((address_space(2))) *pPtr) {
MyStruct s = pPtr[0];
pPtr[0] = s;
diff --git a/test/CodeGen/addrsig.c b/test/CodeGen/addrsig.c
new file mode 100644
index 000000000000..d7141092bb35
--- /dev/null
+++ b/test/CodeGen/addrsig.c
@@ -0,0 +1,20 @@
+// REQUIRES: x86-registered-target
+
+// RUN: %clang_cc1 -triple=x86_64-unknown-linux -S %s -faddrsig -O -o - | FileCheck --check-prefix=ADDRSIG %s
+// RUN: %clang_cc1 -triple=x86_64-unknown-linux -S %s -O -o - | FileCheck --check-prefix=NO-ADDRSIG %s
+
+// ADDRSIG: .addrsig
+// ADDRSIG: .addrsig_sym g1
+// ADDRSIG-NOT: .addrsig_sym g2
+
+// NO-ADDRSIG-NOT: .addrsig
+
+extern const int g1[], g2[];
+
+const int *f1() {
+ return g1;
+}
+
+int f2() {
+ return g2[0];
+}
diff --git a/test/CodeGen/adx-builtins.c b/test/CodeGen/adx-builtins.c
index 8738c5d17715..ac33367d0ebd 100644
--- a/test/CodeGen/adx-builtins.c
+++ b/test/CodeGen/adx-builtins.c
@@ -1,6 +1,6 @@
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ffreestanding -target-feature +adx -emit-llvm -o - %s | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
unsigned char test_addcarryx_u32(unsigned char __cf, unsigned int __x,
unsigned int __y, unsigned int *__p) {
diff --git a/test/CodeGen/aggregate-assign-call.c b/test/CodeGen/aggregate-assign-call.c
new file mode 100644
index 000000000000..83983722db94
--- /dev/null
+++ b/test/CodeGen/aggregate-assign-call.c
@@ -0,0 +1,93 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O1 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=O1
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O0 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=O0
+//
+// Ensure that we place appropriate lifetime markers around indirectly returned
+// temporaries, and that the lifetime.ends appear in a timely manner.
+//
+// -O1 is used so lifetime markers actually get emitted.
+
+struct S {
+ int ns[40];
+};
+
+struct S foo(void);
+
+// CHECK-LABEL: define dso_local void @bar
+struct S bar() {
+ // O0-NOT: @llvm.lifetime.start
+ // O0-NOT: @llvm.lifetime.end
+
+ struct S r;
+ // O1: call void @llvm.lifetime.start.p0i8({{[^,]*}}, i8* nonnull %[[TMP1:[^)]+]])
+ // O1: call void @foo
+ r = foo();
+ // O1: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* nonnull %[[TMP1]])
+
+ // O1: call void @llvm.lifetime.start.p0i8({{[^,]*}}, i8* nonnull %[[TMP2:[^)]+]])
+ // O1: call void @foo
+ r = foo();
+ // O1: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* nonnull %[[TMP2]])
+
+ // O1: call void @llvm.lifetime.start.p0i8({{[^,]*}}, i8* nonnull %[[TMP3:[^)]+]])
+ // O1: call void @foo
+ r = foo();
+ // O1: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* nonnull %[[TMP3]])
+
+ return r;
+}
+
+struct S foo_int(int);
+
+// Be sure that we're placing the lifetime.end so that all paths go through it.
+// Since this function turns out to be large-ish, it is marked optnone to help
+// keep the checked IR stable.
+// CHECK-LABEL: define dso_local void @baz
+__attribute__((optnone))
+struct S baz(int i, volatile int *j) {
+ // O0-NOT: @llvm.lifetime.start
+ // O0-NOT: @llvm.lifetime.end
+
+ struct S r;
+ // O1: %[[TMP1_ALLOCA:[^ ]+]] = alloca %struct.S
+ // O1: %[[TMP2_ALLOCA:[^ ]+]] = alloca %struct.S
+ // O1: br label %[[DO_BODY:.+]]
+
+ do {
+ // O1: [[DO_BODY]]:
+ // O1: %[[P:[^ ]+]] = bitcast %struct.S* %[[TMP1_ALLOCA]] to i8*
+ // O1: call void @llvm.lifetime.start.p0i8({{[^,]*}}, i8* %[[P]])
+ // O1: br i1 {{[^,]+}}, label %[[IF_THEN:[^,]+]], label %[[IF_END:[^,]+]]
+ //
+ // O1: [[IF_THEN]]:
+ // O1: %[[P:[^ ]+]] = bitcast %struct.S* %[[TMP1_ALLOCA]] to i8*
+ // O1: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* %[[P]])
+ // O1: br label %[[DO_END:.*]]
+ //
+ // O1: [[IF_END]]:
+ // O1: call void @foo_int(%struct.S* sret %[[TMP1_ALLOCA]],
+ // O1: call void @llvm.memcpy
+ // O1: %[[P:[^ ]+]] = bitcast %struct.S* %[[TMP1_ALLOCA]] to i8*
+ // O1: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* %[[P]])
+ // O1: %[[P:[^ ]+]] = bitcast %struct.S* %[[TMP2_ALLOCA]] to i8*
+ // O1: call void @llvm.lifetime.start.p0i8({{[^,]*}}, i8* %[[P]])
+ // O1: call void @foo_int(%struct.S* sret %[[TMP2_ALLOCA]],
+ // O1: call void @llvm.memcpy
+ // O1: %[[P:[^ ]+]] = bitcast %struct.S* %[[TMP2_ALLOCA]] to i8*
+ // O1: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* %[[P]])
+ // O1: br label %[[DO_COND:.*]]
+ //
+ // O1: [[DO_COND]]:
+ // O1: br label %[[DO_BODY]]
+ r = foo_int(({
+ if (*j)
+ break;
+ i++;
+ }));
+
+ r = foo_int(i++);
+ } while (1);
+
+ // O1: [[DO_END]]:
+ // O1-NEXT: ret void
+ return r;
+}
diff --git a/test/CodeGen/alias.c b/test/CodeGen/alias.c
index 6ec12702d86b..78b020454a9f 100644
--- a/test/CodeGen/alias.c
+++ b/test/CodeGen/alias.c
@@ -23,20 +23,20 @@ const int wacom_usb_ids[] = {1, 1, 2, 3, 5, 8, 13, 0};
extern const int __mod_usb_device_table __attribute__ ((alias("wacom_usb_ids")));
// CHECKBASIC-DAG: @__mod_usb_device_table = alias i32, getelementptr inbounds ([8 x i32], [8 x i32]* @wacom_usb_ids, i32 0, i32 0)
// CHECKASM-DAG: .globl __mod_usb_device_table
-// CHECKASM-DAG: __mod_usb_device_table = wacom_usb_ids
+// CHECKASM-DAG: .set __mod_usb_device_table, wacom_usb_ids
// CHECKASM-NOT: .size __mod_usb_device_table
extern int g1;
extern int g1 __attribute((alias("g0")));
// CHECKBASIC-DAG: @g1 = alias i32, i32* @g0
// CHECKASM-DAG: .globl g1
-// CHECKASM-DAG: g1 = g0
+// CHECKASM-DAG: .set g1, g0
// CHECKASM-NOT: .size g1
extern __thread int __libc_errno __attribute__ ((alias ("TL_WITH_ALIAS")));
// CHECKBASIC-DAG: @__libc_errno = thread_local alias i32, i32* @TL_WITH_ALIAS
// CHECKASM-DAG: .globl __libc_errno
-// CHECKASM-DAG: __libc_errno = TL_WITH_ALIAS
+// CHECKASM-DAG: .set __libc_errno, TL_WITH_ALIAS
// CHECKASM-NOT: .size __libc_errno
void f0(void) { }
diff --git a/test/CodeGen/arm-aapcs-vfp.c b/test/CodeGen/arm-aapcs-vfp.c
index 38044cbb1e23..ba5a969b0edc 100644
--- a/test/CodeGen/arm-aapcs-vfp.c
+++ b/test/CodeGen/arm-aapcs-vfp.c
@@ -17,11 +17,7 @@
// RUN: -ffreestanding \
// RUN: -emit-llvm -w -o - %s | FileCheck -check-prefix=CHECK64 %s
-#ifdef __arm64__
#include <arm_neon.h>
-#else
-#include <arm_neon.h>
-#endif
struct homogeneous_struct {
float f[2];
diff --git a/test/CodeGen/arm-arguments.c b/test/CodeGen/arm-arguments.c
index ec3e1734b0c3..ef4e76054ff8 100644
--- a/test/CodeGen/arm-arguments.c
+++ b/test/CodeGen/arm-arguments.c
@@ -208,13 +208,13 @@ float32x4_t f35(int i, s35_with_align s1, s35_with_align s2) {
// APCS-GNU: %[[a:.*]] = alloca %struct.s35, align 16
// APCS-GNU: %[[b:.*]] = bitcast %struct.s35* %[[a]] to i8*
// APCS-GNU: %[[c:.*]] = bitcast %struct.s35* %0 to i8*
-// APCS-GNU: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[b]], i8* %[[c]]
+// APCS-GNU: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align {{[0-9]+}} %[[b]], i8* align {{[0-9]+}} %[[c]]
// APCS-GNU: %[[d:.*]] = bitcast %struct.s35* %[[a]] to <4 x float>*
// APCS-GNU: load <4 x float>, <4 x float>* %[[d]], align 16
// AAPCS-LABEL: define arm_aapcscc <4 x float> @f35(i32 %i, %struct.s35* byval align 8, %struct.s35* byval align 8)
// AAPCS: %[[a:.*]] = alloca %struct.s35, align 16
// AAPCS: %[[b:.*]] = bitcast %struct.s35* %[[a]] to i8*
// AAPCS: %[[c:.*]] = bitcast %struct.s35* %0 to i8*
-// AAPCS: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[b]], i8* %[[c]]
+// AAPCS: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %[[b]], i8* align 8 %[[c]]
// AAPCS: %[[d:.*]] = bitcast %struct.s35* %[[a]] to <4 x float>*
// AAPCS: load <4 x float>, <4 x float>* %[[d]], align 16
diff --git a/test/CodeGen/arm-build-attributes.c b/test/CodeGen/arm-build-attributes.c
new file mode 100644
index 000000000000..ceb97d153f81
--- /dev/null
+++ b/test/CodeGen/arm-build-attributes.c
@@ -0,0 +1,4 @@
+// RUN: %clang --target=arm-none-eabi -x c - -o - -S < %s -mcpu=cortex-a5 -mfpu=vfpv4-d16 | FileCheck %s
+// REQUIRES: arm-registered-target
+// CHECK: .fpu vfpv4-d16
+void foo() {}
diff --git a/test/CodeGen/arm-fp16-arguments.c b/test/CodeGen/arm-fp16-arguments.c
index 65f076ac3ca8..d739f4b9c66a 100644
--- a/test/CodeGen/arm-fp16-arguments.c
+++ b/test/CodeGen/arm-fp16-arguments.c
@@ -25,3 +25,27 @@ __fp16 t2() { return g; }
// HARD: ret float [[BITCAST]]
// NATIVE: [[LOAD:%.*]] = load half, half* @g
// NATIVE: ret half [[LOAD]]
+
+_Float16 h;
+
+void t3(_Float16 a) { h = a; }
+// SOFT: define void @t3(i32 [[PARAM:%.*]])
+// SOFT: [[TRUNC:%.*]] = trunc i32 [[PARAM]] to i16
+// HARD: define arm_aapcs_vfpcc void @t3(float [[PARAM:%.*]])
+// HARD: [[BITCAST:%.*]] = bitcast float [[PARAM]] to i32
+// HARD: [[TRUNC:%.*]] = trunc i32 [[BITCAST]] to i16
+// CHECK: store i16 [[TRUNC]], i16* bitcast (half* @h to i16*)
+// NATIVE: define void @t3(half [[PARAM:%.*]])
+// NATIVE: store half [[PARAM]], half* @h
+
+_Float16 t4() { return h; }
+// SOFT: define i32 @t4()
+// HARD: define arm_aapcs_vfpcc float @t4()
+// NATIVE: define half @t4()
+// CHECK: [[LOAD:%.*]] = load i16, i16* bitcast (half* @h to i16*)
+// CHECK: [[ZEXT:%.*]] = zext i16 [[LOAD]] to i32
+// SOFT: ret i32 [[ZEXT]]
+// HARD: [[BITCAST:%.*]] = bitcast i32 [[ZEXT]] to float
+// HARD: ret float [[BITCAST]]
+// NATIVE: [[LOAD:%.*]] = load half, half* @h
+// NATIVE: ret half [[LOAD]]
diff --git a/test/CodeGen/arm-long-calls.c b/test/CodeGen/arm-long-calls.c
index cff2d6607113..78aee77c8f6d 100644
--- a/test/CodeGen/arm-long-calls.c
+++ b/test/CodeGen/arm-long-calls.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -triple thumbv7-apple-ios5 -target-feature +long-calls -emit-llvm -o - %s | FileCheck -check-prefix=LONGCALL %s
// RUN: %clang_cc1 -triple thumbv7-apple-ios5 -emit-llvm -o - %s | FileCheck -check-prefix=NOLONGCALL %s
-// LONGCALL: attributes #0 = { {{.*}} "target-features"="+long-calls,+thumb-mode"
-// NOLONGCALL-NOT: attributes #0 = { {{.*}} "target-features"="+long-calls,+thumb-mode"
+// LONGCALL: attributes #0 = { {{.*}} "target-features"="+armv7-a,+long-calls,+thumb-mode"
+// NOLONGCALL-NOT: attributes #0 = { {{.*}} "target-features"="+armv7-a,+long-calls,+thumb-mode"
int foo1(int a) { return a; }
diff --git a/test/CodeGen/arm-neon-directed-rounding.c b/test/CodeGen/arm-neon-directed-rounding.c
index b06808a77cd1..5119f9dbf413 100644
--- a/test/CodeGen/arm-neon-directed-rounding.c
+++ b/test/CodeGen/arm-neon-directed-rounding.c
@@ -1,87 +1,128 @@
-// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 \
+// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
+// RUN: opt -S -mem2reg | FileCheck -check-prefixes=CHECK,CHECK-A32 %s
+// RUN: %clang_cc1 -triple arm64-linux-gnueabihf -target-feature +neon \
+// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
+// RUN: opt -S -mem2reg | FileCheck -check-prefixes=CHECK,CHECK-A64 %s
#include <arm_neon.h>
-// CHECK-LABEL: define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 {
-// CHECK: [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> %a) #2
-// CHECK: ret <2 x float> [[VRNDA_V1_I]]
+// CHECK-LABEL: define <2 x float> @test_vrnda_f32(<2 x float> %a)
+// CHECK-A32: [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> %a)
+// CHECK-A64: [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> %a)
+// CHECK: ret <2 x float> [[VRNDA_V1_I]]
float32x2_t test_vrnda_f32(float32x2_t a) {
return vrnda_f32(a);
}
-// CHECK-LABEL: define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 {
-// CHECK: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> %a) #2
-// CHECK: ret <4 x float> [[VRNDAQ_V1_I]]
+// CHECK-LABEL: define <4 x float> @test_vrndaq_f32(<4 x float> %a)
+// CHECK-A32: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> %a)
+// CHECK-A64: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> %a)
+// CHECK: ret <4 x float> [[VRNDAQ_V1_I]]
float32x4_t test_vrndaq_f32(float32x4_t a) {
return vrndaq_f32(a);
}
-// CHECK-LABEL: define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 {
-// CHECK: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> %a) #2
-// CHECK: ret <2 x float> [[VRNDM_V1_I]]
+// CHECK-LABEL: define <2 x float> @test_vrndm_f32(<2 x float> %a)
+// CHECK-A32: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> %a)
+// CHECK-A64: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> %a)
+// CHECK: ret <2 x float> [[VRNDM_V1_I]]
float32x2_t test_vrndm_f32(float32x2_t a) {
return vrndm_f32(a);
}
-// CHECK-LABEL: define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 {
-// CHECK: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> %a) #2
-// CHECK: ret <4 x float> [[VRNDMQ_V1_I]]
+// CHECK-LABEL: define <4 x float> @test_vrndmq_f32(<4 x float> %a)
+// CHECK-A32: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> %a)
+// CHECK-A64: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> %a)
+// CHECK: ret <4 x float> [[VRNDMQ_V1_I]]
float32x4_t test_vrndmq_f32(float32x4_t a) {
return vrndmq_f32(a);
}
-// CHECK-LABEL: define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 {
-// CHECK: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> %a) #2
-// CHECK: ret <2 x float> [[VRNDN_V1_I]]
+// CHECK-LABEL: define <2 x float> @test_vrndn_f32(<2 x float> %a)
+// CHECK-A32: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> %a)
+// CHECK-A64: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> %a)
+// CHECK: ret <2 x float> [[VRNDN_V1_I]]
float32x2_t test_vrndn_f32(float32x2_t a) {
return vrndn_f32(a);
}
-// CHECK-LABEL: define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 {
-// CHECK: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> %a) #2
-// CHECK: ret <4 x float> [[VRNDNQ_V1_I]]
+// CHECK-LABEL: define <4 x float> @test_vrndnq_f32(<4 x float> %a)
+// CHECK-A32: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> %a)
+// CHECK-A64: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> %a)
+// CHECK: ret <4 x float> [[VRNDNQ_V1_I]]
float32x4_t test_vrndnq_f32(float32x4_t a) {
return vrndnq_f32(a);
}
-// CHECK-LABEL: define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 {
-// CHECK: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> %a) #2
-// CHECK: ret <2 x float> [[VRNDP_V1_I]]
+// CHECK-LABEL: define <2 x float> @test_vrndp_f32(<2 x float> %a)
+// CHECK-A32: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> %a)
+// CHECK-A64: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> %a)
+// CHECK: ret <2 x float> [[VRNDP_V1_I]]
float32x2_t test_vrndp_f32(float32x2_t a) {
return vrndp_f32(a);
}
-// CHECK-LABEL: define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 {
-// CHECK: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> %a) #2
-// CHECK: ret <4 x float> [[VRNDPQ_V1_I]]
+// CHECK-LABEL: define <4 x float> @test_vrndpq_f32(<4 x float> %a)
+// CHECK-A32: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> %a)
+// CHECK-A64: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a)
+// CHECK: ret <4 x float> [[VRNDPQ_V1_I]]
float32x4_t test_vrndpq_f32(float32x4_t a) {
return vrndpq_f32(a);
}
-// CHECK-LABEL: define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 {
-// CHECK: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> %a) #2
-// CHECK: ret <2 x float> [[VRNDX_V1_I]]
+// CHECK-LABEL: define <2 x float> @test_vrndx_f32(<2 x float> %a)
+// CHECK-A32: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> %a)
+// CHECK-A64: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> %a)
+// CHECK: ret <2 x float> [[VRNDX_V1_I]]
float32x2_t test_vrndx_f32(float32x2_t a) {
return vrndx_f32(a);
}
-// CHECK-LABEL: define <4 x float> @test_vrndxq_f32(<4 x float> %a) #0 {
-// CHECK: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> %a) #2
-// CHECK: ret <4 x float> [[VRNDXQ_V1_I]]
+// CHECK-LABEL: define <4 x float> @test_vrndxq_f32(<4 x float> %a)
+// CHECK-A32: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> %a)
+// CHECK-A64: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> %a)
+// CHECK: ret <4 x float> [[VRNDXQ_V1_I]]
float32x4_t test_vrndxq_f32(float32x4_t a) {
return vrndxq_f32(a);
}
-// CHECK-LABEL: define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 {
-// CHECK: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> %a) #2
-// CHECK: ret <2 x float> [[VRND_V1_I]]
+// CHECK-LABEL: define <2 x float> @test_vrnd_f32(<2 x float> %a)
+// CHECK-A32: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> %a)
+// CHECK-A64: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> %a)
+// CHECK: ret <2 x float> [[VRND_V1_I]]
float32x2_t test_vrnd_f32(float32x2_t a) {
return vrnd_f32(a);
}
-// CHECK-LABEL: define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 {
-// CHECK: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> %a) #2
-// CHECK: ret <4 x float> [[VRNDQ_V1_I]]
+// CHECK-LABEL: define <4 x float> @test_vrndq_f32(<4 x float> %a)
+// CHECK-A32: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> %a)
+// CHECK-A64: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a)
+// CHECK: ret <4 x float> [[VRNDQ_V1_I]]
float32x4_t test_vrndq_f32(float32x4_t a) {
return vrndq_f32(a);
}
+
+// CHECK-LABEL: define float @test_vrndns_f32(float %a)
+// CHECK-A32: [[VRNDN_I:%.*]] = call float @llvm.arm.neon.vrintn.f32(float %a)
+// CHECK-A64: [[VRNDN_I:%.*]] = call float @llvm.aarch64.neon.frintn.f32(float %a)
+// CHECK: ret float [[VRNDN_I]]
+float32_t test_vrndns_f32(float32_t a) {
+ return vrndns_f32(a);
+}
+
+// CHECK-LABEL: define <2 x float> @test_vrndi_f32(<2 x float> %a)
+// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %a)
+// CHECK: ret <2 x float> [[VRNDI1_I]]
+float32x2_t test_vrndi_f32(float32x2_t a) {
+ return vrndi_f32(a);
+}
+
+// CHECK-LABEL: define <4 x float> @test_vrndiq_f32(<4 x float> %a)
+// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a)
+// CHECK: ret <4 x float> [[VRNDI1_I]]
+float32x4_t test_vrndiq_f32(float32x4_t a) {
+ return vrndiq_f32(a);
+}
diff --git a/test/CodeGen/arm-neon-dot-product.c b/test/CodeGen/arm-neon-dot-product.c
new file mode 100644
index 000000000000..34acd02c61c6
--- /dev/null
+++ b/test/CodeGen/arm-neon-dot-product.c
@@ -0,0 +1,76 @@
+// RUN: %clang_cc1 -triple armv8-linux-gnueabihf -target-cpu cortex-a75 -target-feature +dotprod \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -instcombine | FileCheck %s
+
+// REQUIRES: arm-registered-target
+
+// Test ARM v8.2-A dot product intrinsics
+
+#include <arm_neon.h>
+
+uint32x2_t test_vdot_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: ret <2 x i32> [[RESULT]]
+ return vdot_u32(a, b, c);
+}
+
+uint32x4_t test_vdotq_u32(uint32x4_t a, uint8x16_t b, uint8x16_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: ret <4 x i32> [[RESULT]]
+ return vdotq_u32(a, b, c);
+}
+
+int32x2_t test_vdot_s32(int32x2_t a, int8x8_t b, int8x8_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: ret <2 x i32> [[RESULT]]
+ return vdot_s32(a, b, c);
+}
+
+int32x4_t test_vdotq_s32(int32x4_t a, int8x16_t b, int8x16_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: ret <4 x i32> [[RESULT]]
+ return vdotq_s32(a, b, c);
+}
+
+uint32x2_t test_vdot_lane_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
+// CHECK: ret <2 x i32> [[RESULT]]
+ return vdot_lane_u32(a, b, c, 1);
+}
+
+uint32x4_t test_vdotq_lane_u32(uint32x4_t a, uint8x16_t b, uint8x8_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
+// CHECK: ret <4 x i32> [[RESULT]]
+ return vdotq_lane_u32(a, b, c, 1);
+}
+
+int32x2_t test_vdot_lane_s32(int32x2_t a, int8x8_t b, int8x8_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
+// CHECK: ret <2 x i32> [[RESULT]]
+ return vdot_lane_s32(a, b, c, 1);
+}
+
+int32x4_t test_vdotq_lane_s32(int32x4_t a, int8x16_t b, int8x8_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
+// CHECK: ret <4 x i32> [[RESULT]]
+ return vdotq_lane_s32(a, b, c, 1);
+}
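The lane-indexed checks above encode the Armv8.2-A dot-product semantics: each 32-bit lane of the accumulator is incremented by the dot product of the corresponding four bytes of the two i8 operands, and the vdot_lane_*/vdotq_lane_* variants first broadcast the selected 32-bit group of c (the bitcast/shufflevector/bitcast sequence in the CHECK lines). A minimal scalar sketch of the non-lane unsigned form, assuming plain C types; the name udot_ref is illustrative and not part of the test:

    #include <stdint.h>

    /* Scalar model of the 64-bit vdot_u32 form: the accumulator has two
       32-bit lanes, b and c supply eight bytes grouped in fours; sdot is
       identical except the bytes are sign-extended. */
    static void udot_ref(uint32_t acc[2], const uint8_t b[8], const uint8_t c[8]) {
        for (int i = 0; i < 2; ++i)
            for (int j = 0; j < 4; ++j)
                acc[i] += (uint32_t)b[4 * i + j] * (uint32_t)c[4 * i + j];
    }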
diff --git a/test/CodeGen/arm-neon-fma.c b/test/CodeGen/arm-neon-fma.c
index b5184c195e40..5f6737aaa771 100644
--- a/test/CodeGen/arm-neon-fma.c
+++ b/test/CodeGen/arm-neon-fma.c
@@ -20,3 +20,27 @@ float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs)
float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs) {
return vfmaq_f32(accum, lhs, rhs);
}
+
+// CHECK-LABEL: define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
+// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
+// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> [[VECINIT1_I]], <2 x float> %a)
+// CHECK: ret <2 x float> [[TMP3]]
+float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
+ return vfma_n_f32(a, b, n);
+}
+
+// CHECK-LABEL: define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
+// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
+// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
+// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
+// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
+// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
+// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> [[VECINIT3_I]], <4 x float> %a)
+// CHECK: ret <4 x float> [[TMP3]]
+float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
+ return vfmaq_n_f32(a, b, n);
+}
diff --git a/test/CodeGen/arm-neon-vld.c b/test/CodeGen/arm-neon-vld.c
new file mode 100644
index 000000000000..2bf55bb1c6cb
--- /dev/null
+++ b/test/CodeGen/arm-neon-vld.c
@@ -0,0 +1,2498 @@
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
+// RUN: -S -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | \
+// RUN: FileCheck -check-prefixes=CHECK,CHECK-A64 %s
+// RUN: %clang_cc1 -triple armv8-none-linux-gnueabi -target-feature +neon \
+// RUN: -S -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | \
+// RUN: FileCheck -check-prefixes=CHECK,CHECK-A32 %s
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: @test_vld1_f16_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK-A32: %struct.float16x4x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF:(half|i16)]]*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x [[HALF]]>, <4 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x2.v4f16.p0f16|arm.neon.vld1x2.v4i16.p0i16}}([[HALF]]* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]> }*
+// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD1XN]], { <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.float16x4x2_t [[TMP6]]
+// CHECK-A32: ret void
+float16x4x2_t test_vld1_f16_x2(float16_t const *a) {
+ return vld1_f16_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1_f16_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK-A32: %struct.float16x4x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF]]*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x3.v4f16.p0f16|arm.neon.vld1x3.v4i16.p0i16}}([[HALF]]* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }*
+// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD1XN]], { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.float16x4x3_t [[TMP6]]
+// CHECK-A32: ret void
+float16x4x3_t test_vld1_f16_x3(float16_t const *a) {
+ return vld1_f16_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1_f16_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK-A32: %struct.float16x4x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF]]*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x4.v4f16.p0f16|arm.neon.vld1x4.v4i16.p0i16}}([[HALF]]* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }*
+// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD1XN]], { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.float16x4x4_t [[TMP6]]
+// CHECK-A32: ret void
+float16x4x4_t test_vld1_f16_x4(float16_t const *a) {
+ return vld1_f16_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1_f32_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK-A32: %struct.float32x2x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v2f32.p0f32(float* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.float32x2x2_t [[TMP6]]
+// CHECK-A32: ret void
+float32x2x2_t test_vld1_f32_x2(float32_t const *a) {
+ return vld1_f32_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1_f32_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK-A32: %struct.float32x2x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v2f32.p0f32(float* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.float32x2x3_t [[TMP6]]
+// CHECK-A32: ret void
+float32x2x3_t test_vld1_f32_x3(float32_t const *a) {
+ return vld1_f32_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1_f32_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK-A32: %struct.float32x2x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v2f32.p0f32(float* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.float32x2x4_t [[TMP6]]
+// CHECK-A32: ret void
+float32x2x4_t test_vld1_f32_x4(float32_t const *a) {
+ return vld1_f32_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1_p16_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK-A32: %struct.poly16x4x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.poly16x4x2_t [[TMP6]]
+// CHECK-A32: ret void
+poly16x4x2_t test_vld1_p16_x2(poly16_t const *a) {
+ return vld1_p16_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1_p16_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK-A32: %struct.poly16x4x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.poly16x4x3_t [[TMP6]]
+// CHECK-A32: ret void
+poly16x4x3_t test_vld1_p16_x3(poly16_t const *a) {
+ return vld1_p16_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1_p16_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK-A32: %struct.poly16x4x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.poly16x4x4_t [[TMP6]]
+// CHECK-A32: ret void
+poly16x4x4_t test_vld1_p16_x4(poly16_t const *a) {
+ return vld1_p16_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1_p8_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK-A32: %struct.poly8x8x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 16, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.poly8x8x2_t [[TMP4]]
+// CHECK-A32: ret void
+poly8x8x2_t test_vld1_p8_x2(poly8_t const *a) {
+ return vld1_p8_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1_p8_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK-A32: %struct.poly8x8x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 24, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.poly8x8x3_t [[TMP4]]
+// CHECK-A32: ret void
+poly8x8x3_t test_vld1_p8_x3(poly8_t const *a) {
+ return vld1_p8_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1_p8_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK-A32: %struct.poly8x8x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.poly8x8x4_t [[TMP4]]
+// CHECK-A32: ret void
+poly8x8x4_t test_vld1_p8_x4(poly8_t const *a) {
+ return vld1_p8_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1_s16_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK-A32: %struct.int16x4x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.int16x4x2_t [[TMP6]]
+// CHECK-A32: ret void
+int16x4x2_t test_vld1_s16_x2(int16_t const *a) {
+ return vld1_s16_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1_s16_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK-A32: %struct.int16x4x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.int16x4x3_t [[TMP6]]
+// CHECK-A32: ret void
+int16x4x3_t test_vld1_s16_x3(int16_t const *a) {
+ return vld1_s16_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1_s16_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK-A32: %struct.int16x4x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.int16x4x4_t [[TMP6]]
+// CHECK-A32: ret void
+int16x4x4_t test_vld1_s16_x4(int16_t const *a) {
+ return vld1_s16_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1_s32_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK-A32: %struct.int32x2x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v2i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.int32x2x2_t [[TMP6]]
+// CHECK-A32: ret void
+int32x2x2_t test_vld1_s32_x2(int32_t const *a) {
+ return vld1_s32_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1_s32_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK-A32: %struct.int32x2x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v2i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.int32x2x3_t [[TMP6]]
+// CHECK-A32: ret void
+int32x2x3_t test_vld1_s32_x3(int32_t const *a) {
+ return vld1_s32_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1_s32_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK-A32: %struct.int32x2x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v2i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.int32x2x4_t [[TMP6]]
+// CHECK-A32: ret void
+int32x2x4_t test_vld1_s32_x4(int32_t const *a) {
+ return vld1_s32_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1_s64_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK-A32: %struct.int64x1x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.int64x1x2_t [[TMP6]]
+// CHECK-A32: ret void
+int64x1x2_t test_vld1_s64_x2(int64_t const *a) {
+ return vld1_s64_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1_s64_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK-A32: %struct.int64x1x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.int64x1x3_t [[TMP6]]
+// CHECK-A32: ret void
+int64x1x3_t test_vld1_s64_x3(int64_t const *a) {
+ return vld1_s64_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1_s64_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK-A32: %struct.int64x1x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.int64x1x4_t [[TMP6]]
+// CHECK-A32: ret void
+int64x1x4_t test_vld1_s64_x4(int64_t const *a) {
+ return vld1_s64_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1_s8_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK-A32: %struct.int8x8x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 16, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.int8x8x2_t [[TMP4]]
+// CHECK-A32: ret void
+int8x8x2_t test_vld1_s8_x2(int8_t const *a) {
+ return vld1_s8_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1_s8_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK-A32: %struct.int8x8x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 24, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.int8x8x3_t [[TMP4]]
+// CHECK-A32: ret void
+int8x8x3_t test_vld1_s8_x3(int8_t const *a) {
+ return vld1_s8_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1_s8_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK-A32: %struct.int8x8x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.int8x8x4_t [[TMP4]]
+// CHECK-A32: ret void
+int8x8x4_t test_vld1_s8_x4(int8_t const *a) {
+ return vld1_s8_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1_u16_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK-A32: %struct.uint16x4x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.uint16x4x2_t [[TMP6]]
+// CHECK-A32: ret void
+uint16x4x2_t test_vld1_u16_x2(uint16_t const *a) {
+ return vld1_u16_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1_u16_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK-A32: %struct.uint16x4x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.uint16x4x3_t [[TMP6]]
+// CHECK-A32: ret void
+uint16x4x3_t test_vld1_u16_x3(uint16_t const *a) {
+ return vld1_u16_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1_u16_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK-A32: %struct.uint16x4x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v4i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.uint16x4x4_t [[TMP6]]
+// CHECK-A32: ret void
+uint16x4x4_t test_vld1_u16_x4(uint16_t const *a) {
+ return vld1_u16_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1_u32_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK-A32: %struct.uint32x2x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v2i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.uint32x2x2_t [[TMP6]]
+// CHECK-A32: ret void
+uint32x2x2_t test_vld1_u32_x2(uint32_t const *a) {
+ return vld1_u32_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1_u32_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK-A32: %struct.uint32x2x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v2i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.uint32x2x3_t [[TMP6]]
+// CHECK-A32: ret void
+uint32x2x3_t test_vld1_u32_x3(uint32_t const *a) {
+ return vld1_u32_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1_u32_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK-A32: %struct.uint32x2x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v2i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.uint32x2x4_t [[TMP6]]
+// CHECK-A32: ret void
+uint32x2x4_t test_vld1_u32_x4(uint32_t const *a) {
+ return vld1_u32_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1_u64_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK-A32: %struct.uint64x1x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.uint64x1x2_t [[TMP6]]
+// CHECK-A32: ret void
+uint64x1x2_t test_vld1_u64_x2(uint64_t const *a) {
+ return vld1_u64_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1_u64_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK-A32: %struct.uint64x1x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.uint64x1x3_t [[TMP6]]
+// CHECK-A32: ret void
+uint64x1x3_t test_vld1_u64_x3(uint64_t const *a) {
+ return vld1_u64_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1_u64_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK-A32: %struct.uint64x1x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v1i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.uint64x1x4_t [[TMP6]]
+// CHECK-A32: ret void
+uint64x1x4_t test_vld1_u64_x4(uint64_t const *a) {
+ return vld1_u64_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1_u8_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK-A32: %struct.uint8x8x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 16, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.uint8x8x2_t [[TMP4]]
+// CHECK-A32: ret void
+uint8x8x2_t test_vld1_u8_x2(uint8_t const *a) {
+ return vld1_u8_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1_u8_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK-A32: %struct.uint8x8x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 24, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.uint8x8x3_t [[TMP4]]
+// CHECK-A32: ret void
+uint8x8x3_t test_vld1_u8_x3(uint8_t const *a) {
+ return vld1_u8_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1_u8_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK-A32: %struct.uint8x8x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK-A64: ret %struct.uint8x8x4_t [[TMP4]]
+// CHECK-A32: ret void
+uint8x8x4_t test_vld1_u8_x4(uint8_t const *a) {
+ return vld1_u8_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1q_f16_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK-A32: %struct.float16x8x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF]]*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x [[HALF]]>, <8 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x2.v8f16.p0f16|arm.neon.vld1x2.v8i16.p0i16}}([[HALF]]* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]> }*
+// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD1XN]], { <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.float16x8x2_t [[TMP6]]
+// CHECK-A32: ret void
+float16x8x2_t test_vld1q_f16_x2(float16_t const *a) {
+ return vld1q_f16_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1q_f16_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK-A32: %struct.float16x8x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF]]*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x3.v8f16.p0f16|arm.neon.vld1x3.v8i16.p0i16}}([[HALF]]* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }*
+// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD1XN]], { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.float16x8x3_t [[TMP6]]
+// CHECK-A32: ret void
+float16x8x3_t test_vld1q_f16_x3(float16_t const *a) {
+ return vld1q_f16_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1q_f16_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK-A32: %struct.float16x8x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF]]*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x4.v8f16.p0f16|arm.neon.vld1x4.v8i16.p0i16}}([[HALF]]* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }*
+// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD1XN]], { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.float16x8x4_t [[TMP6]]
+// CHECK-A32: ret void
+float16x8x4_t test_vld1q_f16_x4(float16_t const *a) {
+ return vld1q_f16_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1q_f32_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK-A32: %struct.float32x4x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v4f32.p0f32(float* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
+// CHECK: store { <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.float32x4x2_t [[TMP6]]
+// CHECK-A32: ret void
+float32x4x2_t test_vld1q_f32_x2(float32_t const *a) {
+ return vld1q_f32_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1q_f32_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK-A32: %struct.float32x4x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v4f32.p0f32(float* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.float32x4x3_t [[TMP6]]
+// CHECK-A32: ret void
+float32x4x3_t test_vld1q_f32_x3(float32_t const *a) {
+ return vld1q_f32_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1q_f32_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK-A32: %struct.float32x4x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v4f32.p0f32(float* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.float32x4x4_t [[TMP6]]
+// CHECK-A32: ret void
+float32x4x4_t test_vld1q_f32_x4(float32_t const *a) {
+ return vld1q_f32_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1q_p16_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK-A32: %struct.poly16x8x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.poly16x8x2_t [[TMP6]]
+// CHECK-A32: ret void
+poly16x8x2_t test_vld1q_p16_x2(poly16_t const *a) {
+ return vld1q_p16_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1q_p16_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK-A32: %struct.poly16x8x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.poly16x8x3_t [[TMP6]]
+// CHECK-A32: ret void
+poly16x8x3_t test_vld1q_p16_x3(poly16_t const *a) {
+ return vld1q_p16_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1q_p16_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK-A32: %struct.poly16x8x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.poly16x8x4_t [[TMP6]]
+// CHECK-A32: ret void
+poly16x8x4_t test_vld1q_p16_x4(poly16_t const *a) {
+ return vld1q_p16_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1q_p8_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK-A32: %struct.poly8x16x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.poly8x16x2_t [[TMP4]]
+// CHECK-A32: ret void
+poly8x16x2_t test_vld1q_p8_x2(poly8_t const *a) {
+ return vld1q_p8_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1q_p8_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK-A32: %struct.poly8x16x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 48, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.poly8x16x3_t [[TMP4]]
+// CHECK-A32: ret void
+poly8x16x3_t test_vld1q_p8_x3(poly8_t const *a) {
+ return vld1q_p8_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1q_p8_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK-A32: %struct.poly8x16x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 64, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.poly8x16x4_t [[TMP4]]
+// CHECK-A32: ret void
+poly8x16x4_t test_vld1q_p8_x4(poly8_t const *a) {
+ return vld1q_p8_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1q_s16_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK-A32: %struct.int16x8x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.int16x8x2_t [[TMP6]]
+// CHECK-A32: ret void
+int16x8x2_t test_vld1q_s16_x2(int16_t const *a) {
+ return vld1q_s16_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1q_s16_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK-A32: %struct.int16x8x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.int16x8x3_t [[TMP6]]
+// CHECK-A32: ret void
+int16x8x3_t test_vld1q_s16_x3(int16_t const *a) {
+ return vld1q_s16_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1q_s16_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK-A32: %struct.int16x8x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.int16x8x4_t [[TMP6]]
+// CHECK-A32: ret void
+int16x8x4_t test_vld1q_s16_x4(int16_t const *a) {
+ return vld1q_s16_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1q_s32_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK-A32: %struct.int32x4x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v4i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.int32x4x2_t [[TMP6]]
+// CHECK-A32: ret void
+int32x4x2_t test_vld1q_s32_x2(int32_t const *a) {
+ return vld1q_s32_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1q_s32_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK-A32: %struct.int32x4x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v4i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.int32x4x3_t [[TMP6]]
+// CHECK-A32: ret void
+int32x4x3_t test_vld1q_s32_x3(int32_t const *a) {
+ return vld1q_s32_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1q_s32_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK-A32: %struct.int32x4x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v4i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.int32x4x4_t [[TMP6]]
+// CHECK-A32: ret void
+int32x4x4_t test_vld1q_s32_x4(int32_t const *a) {
+ return vld1q_s32_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1q_s64_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK-A32: %struct.int64x2x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.int64x2x2_t [[TMP6]]
+// CHECK-A32: ret void
+int64x2x2_t test_vld1q_s64_x2(int64_t const *a) {
+ return vld1q_s64_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1q_s64_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK-A32: %struct.int64x2x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.int64x2x3_t [[TMP6]]
+// CHECK-A32: ret void
+int64x2x3_t test_vld1q_s64_x3(int64_t const *a) {
+ return vld1q_s64_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1q_s64_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK-A32: %struct.int64x2x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.int64x2x4_t [[TMP6]]
+// CHECK-A32: ret void
+int64x2x4_t test_vld1q_s64_x4(int64_t const *a) {
+ return vld1q_s64_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1q_s8_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK-A32: %struct.int8x16x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.int8x16x2_t [[TMP4]]
+// CHECK-A32: ret void
+int8x16x2_t test_vld1q_s8_x2(int8_t const *a) {
+ return vld1q_s8_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1q_s8_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK-A32: %struct.int8x16x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 48, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.int8x16x3_t [[TMP4]]
+// CHECK-A32: ret void
+int8x16x3_t test_vld1q_s8_x3(int8_t const *a) {
+ return vld1q_s8_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1q_s8_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK-A32: %struct.int8x16x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 64, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.int8x16x4_t [[TMP4]]
+// CHECK-A32: ret void
+int8x16x4_t test_vld1q_s8_x4(int8_t const *a) {
+ return vld1q_s8_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1q_u16_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK-A32: %struct.uint16x8x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.uint16x8x2_t [[TMP6]]
+// CHECK-A32: ret void
+uint16x8x2_t test_vld1q_u16_x2(uint16_t const *a) {
+ return vld1q_u16_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1q_u16_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK-A32: %struct.uint16x8x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.uint16x8x3_t [[TMP6]]
+// CHECK-A32: ret void
+uint16x8x3_t test_vld1q_u16_x3(uint16_t const *a) {
+ return vld1q_u16_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1q_u16_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK-A32: %struct.uint16x8x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i16.p0i16(i16* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.uint16x8x4_t [[TMP6]]
+// CHECK-A32: ret void
+uint16x8x4_t test_vld1q_u16_x4(uint16_t const *a) {
+ return vld1q_u16_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1q_u32_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK-A32: %struct.uint32x4x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v4i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.uint32x4x2_t [[TMP6]]
+// CHECK-A32: ret void
+uint32x4x2_t test_vld1q_u32_x2(uint32_t const *a) {
+ return vld1q_u32_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1q_u32_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK-A32: %struct.uint32x4x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v4i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.uint32x4x3_t [[TMP6]]
+// CHECK-A32: ret void
+uint32x4x3_t test_vld1q_u32_x3(uint32_t const *a) {
+ return vld1q_u32_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1q_u32_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK-A32: %struct.uint32x4x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK: [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v4i32.p0i32(i32* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.uint32x4x4_t [[TMP6]]
+// CHECK-A32: ret void
+uint32x4x4_t test_vld1q_u32_x4(uint32_t const *a) {
+ return vld1q_u32_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1q_u64_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK-A32: %struct.uint64x2x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.uint64x2x2_t [[TMP6]]
+// CHECK-A32: ret void
+uint64x2x2_t test_vld1q_u64_x2(uint64_t const *a) {
+ return vld1q_u64_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1q_u64_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK-A32: %struct.uint64x2x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.uint64x2x3_t [[TMP6]]
+// CHECK-A32: ret void
+uint64x2x3_t test_vld1q_u64_x3(uint64_t const *a) {
+ return vld1q_u64_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1q_u64_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK-A32: %struct.uint64x2x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v2i64.p0i64(i64* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
+// CHECK-A64: [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.uint64x2x4_t [[TMP6]]
+// CHECK-A32: ret void
+uint64x2x4_t test_vld1q_u64_x4(uint64_t const *a) {
+ return vld1q_u64_x4(a);
+}
+
+// CHECK-LABEL: @test_vld1q_u8_x2(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK-A32: %struct.uint8x16x2_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 32, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.uint8x16x2_t [[TMP4]]
+// CHECK-A32: ret void
+uint8x16x2_t test_vld1q_u8_x2(uint8_t const *a) {
+ return vld1q_u8_x2(a);
+}
+
+// CHECK-LABEL: @test_vld1q_u8_x3(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK-A32: %struct.uint8x16x3_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 48, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.uint8x16x3_t [[TMP4]]
+// CHECK-A32: ret void
+uint8x16x3_t test_vld1q_u8_x3(uint8_t const *a) {
+ return vld1q_u8_x3(a);
+}
+
+// CHECK-LABEL: @test_vld1q_u8_x4(
+// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK-A32: %struct.uint8x16x4_t* noalias sret [[RETVAL:%.*]],
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v16i8.p0i8(i8* %a)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 64, i1 false)
+// CHECK-A64: [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
+// CHECK-A64: ret %struct.uint8x16x4_t [[TMP4]]
+// CHECK-A32: ret void
+uint8x16x4_t test_vld1q_u8_x4(uint8_t const *a) {
+ return vld1q_u8_x4(a);
+}
+
+// CHECK-LABEL: @test_vld2_dup_f16(
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
+// CHECK-A64: [[VLD2:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* [[TMP2]])
+// CHECK-A32: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]> }*
+// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD2]], { <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK: ret void
+void test_vld2_dup_f16(float16x4x2_t *dest, const float16_t *src) {
+ *dest = vld2_dup_f16(src);
+}
+
+// CHECK-LABEL: @test_vld2_dup_f32(
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK-A64: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* [[TMP2]])
+// CHECK-A32: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2dup.v2f32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK: ret void
+void test_vld2_dup_f32(float32x2x2_t *dest, const float32_t *src) {
+ *dest = vld2_dup_f32(src);
+}
+
+// CHECK-LABEL: @test_vld2_dup_p16(
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK: ret void
+void test_vld2_dup_p16(poly16x4x2_t *dest, const poly16_t *src) {
+ *dest = vld2_dup_p16(src);
+}
+
+// CHECK-LABEL: @test_vld2_dup_p8(
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2dup.v8i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 16, i1 false)
+// CHECK: ret void
+void test_vld2_dup_p8(poly8x8x2_t *dest, poly8_t *src) {
+ *dest = vld2_dup_p8(src);
+}
+
+// CHECK-LABEL: @test_vld2_dup_s16(
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK: ret void
+void test_vld2_dup_s16(int16x4x2_t *dest, const int16_t *src) {
+ *dest = vld2_dup_s16(src);
+}
+
+// CHECK-LABEL: @test_vld2_dup_s32(
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK-A64: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK-A32: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2dup.v2i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK: ret void
+void test_vld2_dup_s32(int32x2x2_t *dest, const int32_t *src) {
+ *dest = vld2_dup_s32(src);
+}
+
+// CHECK-LABEL: @test_vld2_dup_s8(
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2dup.v8i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 16, i1 false)
+// CHECK: ret void
+void test_vld2_dup_s8(int8x8x2_t *dest, int8_t *src) {
+ *dest = vld2_dup_s8(src);
+}
+
+// CHECK-LABEL: @test_vld2_dup_u16(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK: ret void
+void test_vld2_dup_u16(uint16x4x2_t *dest, const uint16_t *src) {
+ *dest = vld2_dup_u16(src);
+}
+
+// CHECK-LABEL: @test_vld2_dup_u32(
+// CHECK: entry:
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK-A64: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK-A32: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2dup.v2i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK: ret void
+void test_vld2_dup_u32(uint32x2x2_t *dest, const uint32_t *src) {
+ *dest = vld2_dup_u32(src);
+}
+
+// CHECK-LABEL: @test_vld2_dup_s64(
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-A64: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK-A32: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2dup.v1i64.p0i8(i8* [[TMP1]], i32 8)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK: ret void
+void test_vld2_dup_s64(int64x1x2_t *dest, const int64_t *src) {
+ *dest = vld2_dup_s64(src);
+}
+
+// CHECK-LABEL: @test_vld2_dup_u64(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-A64: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK-A32: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2dup.v1i64.p0i8(i8* [[TMP1]], i32 8)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
+// CHECK: ret void
+void test_vld2_dup_u64(uint64x1x2_t *dest, const uint64_t *src) {
+ *dest = vld2_dup_u64(src);
+}
+
+// CHECK-LABEL: @test_vld2_dup_u8(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2dup.v8i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 16, i1 false)
+// CHECK: ret void
+void test_vld2_dup_u8(uint8x8x2_t *dest, const uint8_t *src) {
+ *dest = vld2_dup_u8(src);
+}
+
+// CHECK-LABEL: @test_vld3_dup_f16(
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
+// CHECK-A64: [[VLD3:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* [[TMP2]])
+// CHECK-A32: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }*
+// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD3]], { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK: ret void
+void test_vld3_dup_f16(float16x4x3_t *dest, const float16_t *src) {
+ *dest = vld3_dup_f16(src);
+}
+
+// CHECK-LABEL: @test_vld3_dup_f32(
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK-A64: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* [[TMP2]])
+// CHECK-A32: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3dup.v2f32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK: ret void
+void test_vld3_dup_f32(float32x2x3_t *dest, const float32_t *src) {
+ *dest = vld3_dup_f32(src);
+}
+
+// CHECK-LABEL: @test_vld3_dup_p16(
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK: ret void
+void test_vld3_dup_p16(poly16x4x3_t *dest, const poly16_t *src) {
+ *dest = vld3_dup_p16(src);
+}
+
+// CHECK-LABEL: @test_vld3_dup_p8(
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3dup.v8i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 24, i1 false)
+// CHECK: ret void
+void test_vld3_dup_p8(poly8x8x3_t *dest, const poly8_t *src) {
+ *dest = vld3_dup_p8(src);
+}
+
+// CHECK-LABEL: @test_vld3_dup_s16(
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK: ret void
+void test_vld3_dup_s16(int16x4x3_t *dest, const int16_t *src) {
+ *dest = vld3_dup_s16(src);
+}
+
+// CHECK-LABEL: @test_vld3_dup_s32(
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK-A64: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK-A32: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3dup.v2i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK: ret void
+void test_vld3_dup_s32(int32x2x3_t *dest, const int32_t *src) {
+ *dest = vld3_dup_s32(src);
+}
+
+// CHECK-LABEL: @test_vld3_dup_s8(
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3dup.v8i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 24, i1 false)
+// CHECK: ret void
+void test_vld3_dup_s8(int8x8x3_t *dest, const int8_t *src) {
+ *dest = vld3_dup_s8(src);
+}
+
+// CHECK-LABEL: @test_vld3_dup_u16(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK: ret void
+void test_vld3_dup_u16(uint16x4x3_t *dest, const uint16_t *src) {
+ *dest = vld3_dup_u16(src);
+}
+
+// CHECK-LABEL: @test_vld3_dup_u32(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK-A64: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK-A32: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3dup.v2i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK: ret void
+void test_vld3_dup_u32(uint32x2x3_t *dest, const uint32_t *src) {
+ *dest = vld3_dup_u32(src);
+}
+
+// CHECK-LABEL: @test_vld3_dup_u8(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3dup.v8i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 24, i1 false)
+// CHECK: ret void
+void test_vld3_dup_u8(uint8x8x3_t *dest, const uint8_t *src) {
+ *dest = vld3_dup_u8(src);
+}
+
+// CHECK-LABEL: @test_vld3_dup_s64(
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-A64: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK-A32: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3dup.v1i64.p0i8(i8* [[TMP1]], i32 8)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK: ret void
+void test_vld3_dup_s64(int64x1x3_t *dest, const int64_t *src) {
+ *dest = vld3_dup_s64(src);
+}
+
+// CHECK-LABEL: @test_vld3_dup_u64(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-A64: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK-A32: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3dup.v1i64.p0i8(i8* [[TMP1]], i32 8)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
+// CHECK: ret void
+void test_vld3_dup_u64(uint64x1x3_t *dest, const uint64_t *src) {
+ *dest = vld3_dup_u64(src);
+}
+
+// CHECK-LABEL: @test_vld4_dup_f16(
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
+// CHECK-A64: [[VLD4:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* [[TMP2]])
+// CHECK-A32: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }*
+// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD4]], { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld4_dup_f16(float16x4x4_t *dest, const float16_t *src) {
+ *dest = vld4_dup_f16(src);
+}
+
+// CHECK-LABEL: @test_vld4_dup_f32(
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK-A64: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* [[TMP2]])
+// CHECK-A32: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4dup.v2f32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld4_dup_f32(float32x2x4_t *dest, const float32_t *src) {
+ *dest = vld4_dup_f32(src);
+}
+
+// CHECK-LABEL: @test_vld4_dup_p16(
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld4_dup_p16(poly16x4x4_t *dest, const poly16_t *src) {
+ *dest = vld4_dup_p16(src);
+}
+
+// CHECK-LABEL: @test_vld4_dup_p8(
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4dup.v8i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld4_dup_p8(poly8x8x4_t *dest, const poly8_t *src) {
+ *dest = vld4_dup_p8(src);
+}
+
+// CHECK-LABEL: @test_vld4_dup_s16(
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld4_dup_s16(int16x4x4_t *dest, const int16_t *src) {
+ *dest = vld4_dup_s16(src);
+}
+
+// CHECK-LABEL: @test_vld4_dup_s32(
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK-A64: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK-A32: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4dup.v2i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld4_dup_s32(int32x2x4_t *dest, const int32_t *src) {
+ *dest = vld4_dup_s32(src);
+}
+
+// CHECK-LABEL: @test_vld4_dup_s8(
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4dup.v8i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld4_dup_s8(int8x8x4_t *dest, const int8_t *src) {
+ *dest = vld4_dup_s8(src);
+}
+
+// CHECK-LABEL: @test_vld4_dup_u16(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld4_dup_u16(uint16x4x4_t *dest, const uint16_t *src) {
+ *dest = vld4_dup_u16(src);
+}
+
+// CHECK-LABEL: @test_vld4_dup_u32(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK-A64: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK-A32: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4dup.v2i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld4_dup_u32(uint32x2x4_t *dest, const uint32_t *src) {
+ *dest = vld4_dup_u32(src);
+}
+
+// CHECK-LABEL: @test_vld4_dup_u8(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4dup.v8i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld4_dup_u8(uint8x8x4_t *dest, const uint8_t *src) {
+ *dest = vld4_dup_u8(src);
+}
+
+// CHECK-LABEL: @test_vld4_dup_s64(
+// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-A64: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK-A32: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4dup.v1i64.p0i8(i8* [[TMP1]], i32 8)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld4_dup_s64(int64x1x4_t *dest, const int64_t *src) {
+ *dest = vld4_dup_s64(src);
+}
+
+// CHECK-LABEL: @test_vld4_dup_u64(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i64* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-A64: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK-A32: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4dup.v1i64.p0i8(i8* [[TMP1]], i32 8)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld4_dup_u64(uint64x1x4_t *dest, const uint64_t *src) {
+ *dest = vld4_dup_u64(src);
+}
+
+// CHECK-LABEL: @test_vld2q_dup_f16(
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
+// CHECK-A64: [[VLD2:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* [[TMP2]])
+// CHECK-A32: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]> }*
+// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD2]], { <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld2q_dup_f16(float16x8x2_t *dest, const float16_t *src) {
+ *dest = vld2q_dup_f16(src);
+}
+
+// CHECK-LABEL: @test_vld2q_dup_f32(
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK-A64: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float* [[TMP2]])
+// CHECK-A32: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2dup.v4f32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
+// CHECK: store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld2q_dup_f32(float32x4x2_t *dest, const float32_t *src) {
+ *dest = vld2q_dup_f32(src);
+}
+
+// CHECK-LABEL: @test_vld2q_dup_p16(
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld2q_dup_p16(poly16x8x2_t *dest, const poly16_t *src) {
+ *dest = vld2q_dup_p16(src);
+}
+
+// CHECK-LABEL: @test_vld2q_dup_p8(
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2dup.v16i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld2q_dup_p8(poly8x16x2_t *dest, const poly8_t *src) {
+ *dest = vld2q_dup_p8(src);
+}
+
+// CHECK-LABEL: @test_vld2q_dup_s16(
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld2q_dup_s16(int16x8x2_t *dest, const int16_t *src) {
+ *dest = vld2q_dup_s16(src);
+}
+
+// CHECK-LABEL: @test_vld2q_dup_s32(
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK-A64: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK-A32: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2dup.v4i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld2q_dup_s32(int32x4x2_t *dest, const int32_t *src) {
+ *dest = vld2q_dup_s32(src);
+}
+
+// CHECK-LABEL: @test_vld2q_dup_s8(
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2dup.v16i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld2q_dup_s8(int8x16x2_t *dest, const int8_t *src) {
+ *dest = vld2q_dup_s8(src);
+}
+
+// CHECK-LABEL: @test_vld2q_dup_u16(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld2q_dup_u16(uint16x8x2_t *dest, const uint16_t *src) {
+ *dest = vld2q_dup_u16(src);
+}
+
+// CHECK-LABEL: @test_vld2q_dup_u32(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK-A64: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK-A32: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2dup.v4i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld2q_dup_u32(uint32x4x2_t *dest, const uint32_t *src) {
+ *dest = vld2q_dup_u32(src);
+}
+
+// CHECK-LABEL: @test_vld2q_dup_u8(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2dup.v16i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 32, i1 false)
+// CHECK: ret void
+void test_vld2q_dup_u8(uint8x16x2_t *dest, const uint8_t *src) {
+ *dest = vld2q_dup_u8(src);
+}
+
+// CHECK-LABEL: @test_vld3q_dup_f16(
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
+// CHECK-A64: [[VLD3:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* [[TMP2]])
+// CHECK-A32: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }*
+// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD3]], { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
+// CHECK: ret void
+void test_vld3q_dup_f16(float16x8x3_t *dest, const float16_t *src) {
+ *dest = vld3q_dup_f16(src);
+}
+
+// CHECK-LABEL: @test_vld3q_dup_f32(
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK-A64: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float* [[TMP2]])
+// CHECK-A32: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3dup.v4f32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
+// CHECK: ret void
+void test_vld3q_dup_f32(float32x4x3_t *dest, const float32_t *src) {
+ *dest = vld3q_dup_f32(src);
+}
+
+// CHECK-LABEL: @test_vld3q_dup_p16(
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
+// CHECK: ret void
+void test_vld3q_dup_p16(poly16x8x3_t *dest, const poly16_t *src) {
+ *dest = vld3q_dup_p16(src);
+}
+
+// CHECK-LABEL: @test_vld3q_dup_p8(
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3dup.v16i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 48, i1 false)
+// CHECK: ret void
+void test_vld3q_dup_p8(poly8x16x3_t *dest, const poly8_t *src) {
+ *dest = vld3q_dup_p8(src);
+}
+
+// CHECK-LABEL: @test_vld3q_dup_s16(
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
+// CHECK: ret void
+void test_vld3q_dup_s16(int16x8x3_t *dest, const int16_t *src) {
+ *dest = vld3q_dup_s16(src);
+}
+
+// CHECK-LABEL: @test_vld3q_dup_s32(
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK-A64: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK-A32: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3dup.v4i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
+// CHECK: ret void
+void test_vld3q_dup_s32(int32x4x3_t *dest, const int32_t *src) {
+ *dest = vld3q_dup_s32(src);
+}
+
+// CHECK-LABEL: @test_vld3q_dup_s8(
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3dup.v16i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 48, i1 false)
+// CHECK: ret void
+void test_vld3q_dup_s8(int8x16x3_t *dest, const int8_t *src) {
+ *dest = vld3q_dup_s8(src);
+}
+
+// CHECK-LABEL: @test_vld3q_dup_u16(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
+// CHECK: ret void
+void test_vld3q_dup_u16(uint16x8x3_t *dest, const uint16_t *src) {
+ *dest = vld3q_dup_u16(src);
+}
+
+// CHECK-LABEL: @test_vld3q_dup_u32(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK-A64: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK-A32: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3dup.v4i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
+// CHECK: ret void
+void test_vld3q_dup_u32(uint32x4x3_t *dest, const uint32_t *src) {
+ *dest = vld3q_dup_u32(src);
+}
+
+// CHECK-LABEL: @test_vld3q_dup_u8(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3dup.v16i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 48, i1 false)
+// CHECK: ret void
+void test_vld3q_dup_u8(uint8x16x3_t *dest, const uint8_t *src) {
+ *dest = vld3q_dup_u8(src);
+}
+
+// CHECK-LABEL: @test_vld4q_dup_f16(
+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast half* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
+// CHECK-A64: [[VLD4:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* [[TMP2]])
+// CHECK-A32: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }*
+// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD4]], { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
+// CHECK: ret void
+void test_vld4q_dup_f16(float16x8x4_t *dest, const float16_t *src) {
+ *dest = vld4q_dup_f16(src);
+}
+
+// CHECK-LABEL: @test_vld4q_dup_f32(
+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast float* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK-A64: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float* [[TMP2]])
+// CHECK-A32: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4dup.v4f32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
+// CHECK: ret void
+void test_vld4q_dup_f32(float32x4x4_t *dest, const float32_t *src) {
+ *dest = vld4q_dup_f32(src);
+}
+
+// CHECK-LABEL: @test_vld4q_dup_p16(
+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
+// CHECK: ret void
+void test_vld4q_dup_p16(poly16x8x4_t *dest, const poly16_t *src) {
+ *dest = vld4q_dup_p16(src);
+}
+
+// CHECK-LABEL: @test_vld4q_dup_p8(
+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4dup.v16i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 64, i1 false)
+// CHECK: ret void
+void test_vld4q_dup_p8(poly8x16x4_t *dest, const poly8_t *src) {
+ *dest = vld4q_dup_p8(src);
+}
+
+// CHECK-LABEL: @test_vld4q_dup_s16(
+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
+// CHECK: ret void
+void test_vld4q_dup_s16(int16x8x4_t *dest, const int16_t *src) {
+ *dest = vld4q_dup_s16(src);
+}
+
+// CHECK-LABEL: @test_vld4q_dup_s32(
+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK-A64: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK-A32: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4dup.v4i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
+// CHECK: ret void
+void test_vld4q_dup_s32(int32x4x4_t *dest, const int32_t *src) {
+ *dest = vld4q_dup_s32(src);
+}
+
+// CHECK-LABEL: @test_vld4q_dup_s8(
+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4dup.v16i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 64, i1 false)
+// CHECK: ret void
+void test_vld4q_dup_s8(int8x16x4_t *dest, const int8_t *src) {
+ *dest = vld4q_dup_s8(src);
+}
+
+// CHECK-LABEL: @test_vld4q_dup_u16(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i16* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK-A64: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK-A32: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4dup.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
+// CHECK: ret void
+void test_vld4q_dup_u16(uint16x8x4_t *dest, const uint16_t *src) {
+ *dest = vld4q_dup_u16(src);
+}
+
+// CHECK-LABEL: @test_vld4q_dup_u32(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast i32* %src to i8*
+// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK-A64: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK-A32: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4dup.v4i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* %dest to i8*
+// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
+// CHECK: ret void
+void test_vld4q_dup_u32(uint32x4x4_t *dest, const uint32_t *src) {
+ *dest = vld4q_dup_u32(src);
+}
+
+// CHECK-LABEL: @test_vld4q_dup_u8(
+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align {{16|8}}
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK-A64: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %src)
+// CHECK-A32: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4dup.v16i8.p0i8(i8* %src, i32 1)
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* %dest to i8*
+// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 64, i1 false)
+// CHECK: ret void
+void test_vld4q_dup_u8(uint8x16x4_t *dest, const uint8_t *src) {
+ *dest = vld4q_dup_u8(src);
+}
diff --git a/test/CodeGen/arm-neon-vst.c b/test/CodeGen/arm-neon-vst.c
new file mode 100644
index 000000000000..404c4d9c295c
--- /dev/null
+++ b/test/CodeGen/arm-neon-vst.c
@@ -0,0 +1,2312 @@
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
+// RUN: -S -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | \
+// RUN: FileCheck -check-prefixes=CHECK,CHECK-A64 %s
+// RUN: %clang_cc1 -triple armv8-none-linux-gnueabi -target-feature +neon \
+// RUN: -S -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | \
+// RUN: FileCheck -check-prefixes=CHECK,CHECK-A32 %s
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: @test_vst1_f16_x2(
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x half>]* %coerce.dive to [2 x i64]*
+// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x [[HALF:(half|i16)]]>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x [[HALF]]>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4f16.p0f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], half* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v4i16(i16* [[TMP9]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
+// CHECK: ret void
+void test_vst1_f16_x2(float16_t *a, float16x4x2_t b) {
+ vst1_f16_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_f16_x3(
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x half>]* %coerce.dive to [3 x i64]*
+// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x [[HALF]]>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x [[HALF]]>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x [[HALF]]>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4f16.p0f16(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], half* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v4i16(i16* [[TMP12]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
+// CHECK: ret void
+void test_vst1_f16_x3(float16_t *a, float16x4x3_t b) {
+ vst1_f16_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_f16_x4(
+// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x half>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x [[HALF]]>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x [[HALF]]>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x [[HALF]]>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x [[HALF]]>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4f16.p0f16(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], half* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v4i16(i16* [[TMP15]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
+// CHECK: ret void
+void test_vst1_f16_x4(float16_t *a, float16x4x4_t b) {
+ vst1_f16_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_f32_x2(
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <2 x float>]* %coerce.dive to [2 x i64]*
+// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], float* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0f32.v2f32(float* [[TMP9]], <2 x float> [[TMP7]], <2 x float> [[TMP8]])
+// CHECK: ret void
+void test_vst1_f32_x2(float32_t *a, float32x2x2_t b) {
+ vst1_f32_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_f32_x3(
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <2 x float>]* %coerce.dive to [3 x i64]*
+// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], float* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0f32.v2f32(float* [[TMP12]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]])
+// CHECK: ret void
+void test_vst1_f32_x3(float32_t *a, float32x2x3_t b) {
+ vst1_f32_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_f32_x4(
+// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <2 x float>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], float* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0f32.v2f32(float* [[TMP15]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]])
+// CHECK: ret void
+void test_vst1_f32_x4(float32_t *a, float32x2x4_t b) {
+ vst1_f32_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_p16_x2(
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x i16>]* %coerce.dive to [2 x i64]*
+// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v4i16(i16* [[TMP9]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
+// CHECK: ret void
+void test_vst1_p16_x2(poly16_t *a, poly16x4x2_t b) {
+ vst1_p16_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_p16_x3(
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x i16>]* %coerce.dive to [3 x i64]*
+// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v4i16(i16* [[TMP12]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
+// CHECK: ret void
+void test_vst1_p16_x3(poly16_t *a, poly16x4x3_t b) {
+ vst1_p16_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_p16_x4(
+// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x i16>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v4i16(i16* [[TMP15]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
+// CHECK: ret void
+void test_vst1_p16_x4(poly16_t *a, poly16x4x4_t b) {
+ vst1_p16_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_p8_x2(
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i8>]* %coerce.dive to [2 x i64]*
+// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
+// CHECK: ret void
+void test_vst1_p8_x2(poly8_t *a, poly8x8x2_t b) {
+ vst1_p8_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_p8_x3(
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i8>]* %coerce.dive to [3 x i64]*
+// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]])
+// CHECK: ret void
+void test_vst1_p8_x3(poly8_t *a, poly8x8x3_t b) {
+ vst1_p8_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_p8_x4(
+// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i8>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]])
+// CHECK: ret void
+void test_vst1_p8_x4(poly8_t *a, poly8x8x4_t b) {
+ vst1_p8_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_s16_x2(
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x i16>]* %coerce.dive to [2 x i64]*
+// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v4i16(i16* [[TMP9]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
+// CHECK: ret void
+void test_vst1_s16_x2(int16_t *a, int16x4x2_t b) {
+ vst1_s16_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_s16_x3(
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x i16>]* %coerce.dive to [3 x i64]*
+// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v4i16(i16* [[TMP12]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
+// CHECK: ret void
+void test_vst1_s16_x3(int16_t *a, int16x4x3_t b) {
+ vst1_s16_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_s16_x4(
+// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x i16>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v4i16(i16* [[TMP15]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
+// CHECK: ret void
+void test_vst1_s16_x4(int16_t *a, int16x4x4_t b) {
+ vst1_s16_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_s32_x2(
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <2 x i32>]* %coerce.dive to [2 x i64]*
+// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i32.v2i32(i32* [[TMP9]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]])
+// CHECK: ret void
+void test_vst1_s32_x2(int32_t *a, int32x2x2_t b) {
+ vst1_s32_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_s32_x3(
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <2 x i32>]* %coerce.dive to [3 x i64]*
+// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i32.v2i32(i32* [[TMP12]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]])
+// CHECK: ret void
+void test_vst1_s32_x3(int32_t *a, int32x2x3_t b) {
+ vst1_s32_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_s32_x4(
+// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <2 x i32>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i32* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i32.v2i32(i32* [[TMP15]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]])
+// CHECK: ret void
+void test_vst1_s32_x4(int32_t *a, int32x2x4_t b) {
+ vst1_s32_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_s64_x2(
+// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <1 x i64>]* %coerce.dive to [2 x i64]*
+// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i64.v1i64(i64* [[TMP9]], <1 x i64> [[TMP7]], <1 x i64> [[TMP8]])
+// CHECK: ret void
+void test_vst1_s64_x2(int64_t *a, int64x1x2_t b) {
+ vst1_s64_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_s64_x3(
+// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <1 x i64>]* %coerce.dive to [3 x i64]*
+// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i64.v1i64(i64* [[TMP12]], <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]])
+// CHECK: ret void
+void test_vst1_s64_x3(int64_t *a, int64x1x3_t b) {
+ vst1_s64_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_s64_x4(
+// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <1 x i64>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i64.v1i64(i64* [[TMP15]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]])
+// CHECK: ret void
+void test_vst1_s64_x4(int64_t *a, int64x1x4_t b) {
+  vst1_s64_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_s8_x2(
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i8>]* %coerce.dive to [2 x i64]*
+// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
+// CHECK: ret void
+void test_vst1_s8_x2(int8_t *a, int8x8x2_t b) {
+  vst1_s8_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_s8_x3(
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i8>]* %coerce.dive to [3 x i64]*
+// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]])
+// CHECK: ret void
+void test_vst1_s8_x3(int8_t *a, int8x8x3_t b) {
+  vst1_s8_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_s8_x4(
+// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i8>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]])
+// CHECK: ret void
+void test_vst1_s8_x4(int8_t *a, int8x8x4_t b) {
+  vst1_s8_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_u16_x2(
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x i16>]* %coerce.dive to [2 x i64]*
+// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v4i16(i16* [[TMP9]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
+// CHECK: ret void
+void test_vst1_u16_x2(uint16_t *a, uint16x4x2_t b) {
+  vst1_u16_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_u16_x3(
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x i16>]* %coerce.dive to [3 x i64]*
+// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v4i16(i16* [[TMP12]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
+// CHECK: ret void
+void test_vst1_u16_x3(uint16_t *a, uint16x4x3_t b) {
+  vst1_u16_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_u16_x4(
+// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x i16>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v4i16(i16* [[TMP15]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
+// CHECK: ret void
+void test_vst1_u16_x4(uint16_t *a, uint16x4x4_t b) {
+  vst1_u16_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_u32_x2(
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <2 x i32>]* %coerce.dive to [2 x i64]*
+// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i32.v2i32(i32* [[TMP9]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]])
+// CHECK: ret void
+void test_vst1_u32_x2(uint32_t *a, uint32x2x2_t b) {
+  vst1_u32_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_u32_x3(
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <2 x i32>]* %coerce.dive to [3 x i64]*
+// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i32.v2i32(i32* [[TMP12]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]])
+// CHECK: ret void
+void test_vst1_u32_x3(uint32_t *a, uint32x2x3_t b) {
+  vst1_u32_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_u32_x4(
+// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <2 x i32>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i32* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i32.v2i32(i32* [[TMP15]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]])
+// CHECK: ret void
+void test_vst1_u32_x4(uint32_t *a, uint32x2x4_t b) {
+  vst1_u32_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_u64_x2(
+// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <1 x i64>]* %coerce.dive to [2 x i64]*
+// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i64.v1i64(i64* [[TMP9]], <1 x i64> [[TMP7]], <1 x i64> [[TMP8]])
+// CHECK: ret void
+void test_vst1_u64_x2(uint64_t *a, uint64x1x2_t b) {
+  vst1_u64_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_u64_x3(
+// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <1 x i64>]* %coerce.dive to [3 x i64]*
+// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i64.v1i64(i64* [[TMP12]], <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]])
+// CHECK: ret void
+void test_vst1_u64_x3(uint64_t *a, uint64x1x3_t b) {
+  vst1_u64_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_u64_x4(
+// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <1 x i64>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i64.v1i64(i64* [[TMP15]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]])
+// CHECK: ret void
+void test_vst1_u64_x4(uint64_t *a, uint64x1x4_t b) {
+  vst1_u64_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_u8_x2(
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i8>]* %coerce.dive to [2 x i64]*
+// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
+// CHECK: ret void
+void test_vst1_u8_x2(uint8_t *a, uint8x8x2_t b) {
+  vst1_u8_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_u8_x3(
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i8>]* %coerce.dive to [3 x i64]*
+// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]])
+// CHECK: ret void
+void test_vst1_u8_x3(uint8_t *a, uint8x8x3_t b) {
+  vst1_u8_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1_u8_x4(
+// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i8>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]])
+// CHECK: ret void
+void test_vst1_u8_x4(uint8_t *a, uint8x8x4_t b) {
+  vst1_u8_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_f16_x2(
+// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align [[QALIGN:(16|8)]]
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x half>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x [[HALF]]>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x [[HALF]]>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8f16.p0f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], half* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v8i16(i16* [[TMP9]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
+// CHECK: ret void
+void test_vst1q_f16_x2(float16_t *a, float16x8x2_t b) {
+  vst1q_f16_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_f16_x3(
+// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x half>]* %coerce.dive to [6 x i64]*
+// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x [[HALF]]>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x [[HALF]]>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x [[HALF]]>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8f16.p0f16(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], half* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v8i16(i16* [[TMP12]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+// CHECK: ret void
+void test_vst1q_f16_x3(float16_t *a, float16x8x3_t b) {
+  vst1q_f16_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_f16_x4(
+// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x half>]* %coerce.dive to [8 x i64]*
+// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align [[QALIGN]]
+// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x [[HALF]]>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x [[HALF]]>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x [[HALF]]>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x [[HALF]]>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8f16.p0f16(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], half* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v8i16(i16* [[TMP15]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
+// CHECK: ret void
+void test_vst1q_f16_x4(float16_t *a, float16x8x4_t b) {
+  vst1q_f16_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_f32_x2(
+// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x float>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], float* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0f32.v4f32(float* [[TMP9]], <4 x float> [[TMP7]], <4 x float> [[TMP8]])
+// CHECK: ret void
+void test_vst1q_f32_x2(float32_t *a, float32x4x2_t b) {
+  vst1q_f32_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_f32_x3(
+// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x float>]* %coerce.dive to [6 x i64]*
+// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], float* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0f32.v4f32(float* [[TMP12]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]])
+// CHECK: ret void
+void test_vst1q_f32_x3(float32_t *a, float32x4x3_t b) {
+  vst1q_f32_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_f32_x4(
+// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x float>]* %coerce.dive to [8 x i64]*
+// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align [[QALIGN]]
+// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], float* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0f32.v4f32(float* [[TMP15]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]])
+// CHECK: ret void
+void test_vst1q_f32_x4(float32_t *a, float32x4x4_t b) {
+  vst1q_f32_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_p16_x2(
+// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i16>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v8i16(i16* [[TMP9]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
+// CHECK: ret void
+void test_vst1q_p16_x2(poly16_t *a, poly16x8x2_t b) {
+ vst1q_p16_x2(a, b);
+}
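+
+// The bitcasts feeding the intrinsic call are matched with CHECK-DAG, so
+// the test only requires that they all appear before the call, not that
+// they are emitted in any particular order.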
+
+// CHECK-LABEL: @test_vst1q_p16_x3(
+// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i16>]* %coerce.dive to [6 x i64]*
+// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v8i16(i16* [[TMP12]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+// CHECK: ret void
+void test_vst1q_p16_x3(poly16_t *a, poly16x8x3_t b) {
+ vst1q_p16_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_p16_x4(
+// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i16>]* %coerce.dive to [8 x i64]*
+// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align [[QALIGN]]
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v8i16(i16* [[TMP15]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
+// CHECK: ret void
+void test_vst1q_p16_x4(poly16_t *a, poly16x8x4_t b) {
+ vst1q_p16_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_p8_x2(
+// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <16 x i8>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
+// CHECK: ret void
+void test_vst1q_p8_x2(poly8_t *a, poly8x16x2_t b) {
+ vst1q_p8_x2(a, b);
+}
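+
+// With an i8 element type the pointer and the loaded vectors already have
+// the types the intrinsic expects, so the p8/s8 tests have no bitcast of
+// %a and no <16 x i8> round-trip before the call.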
+
+// CHECK-LABEL: @test_vst1q_p8_x3(
+// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <16 x i8>]* %coerce.dive to [6 x i64]*
+// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]])
+// CHECK: ret void
+void test_vst1q_p8_x3(poly8_t *a, poly8x16x3_t b) {
+ vst1q_p8_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_p8_x4(
+// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <16 x i8>]* %coerce.dive to [8 x i64]*
+// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align [[QALIGN]]
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
+// CHECK: ret void
+void test_vst1q_p8_x4(poly8_t *a, poly8x16x4_t b) {
+ vst1q_p8_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_s16_x2(
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i16>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v8i16(i16* [[TMP9]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
+// CHECK: ret void
+void test_vst1q_s16_x2(int16_t *a, int16x8x2_t b) {
+ vst1q_s16_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_s16_x3(
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i16>]* %coerce.dive to [6 x i64]*
+// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v8i16(i16* [[TMP12]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+// CHECK: ret void
+void test_vst1q_s16_x3(int16_t *a, int16x8x3_t b) {
+ vst1q_s16_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_s16_x4(
+// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i16>]* %coerce.dive to [8 x i64]*
+// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align [[QALIGN]]
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v8i16(i16* [[TMP15]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
+// CHECK: ret void
+void test_vst1q_s16_x4(int16_t *a, int16x8x4_t b) {
+ vst1q_s16_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_s32_x2(
+// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x i32>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i32.v4i32(i32* [[TMP9]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
+// CHECK: ret void
+void test_vst1q_s32_x2(int32_t *a, int32x4x2_t b) {
+ vst1q_s32_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_s32_x3(
+// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x i32>]* %coerce.dive to [6 x i64]*
+// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i32.v4i32(i32* [[TMP12]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+// CHECK: ret void
+void test_vst1q_s32_x3(int32_t *a, int32x4x3_t b) {
+ vst1q_s32_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_s32_x4(
+// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x i32>]* %coerce.dive to [8 x i64]*
+// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align [[QALIGN]]
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i32* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i32.v4i32(i32* [[TMP15]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
+// CHECK: ret void
+void test_vst1q_s32_x4(int32_t *a, int32x4x4_t b) {
+ vst1q_s32_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_s64_x2(
+// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <2 x i64>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i64.v2i64(i64* [[TMP9]], <2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
+// CHECK: ret void
+void test_vst1q_s64_x2(int64_t *a, int64x2x2_t b) {
+ vst1q_s64_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_s64_x3(
+// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <2 x i64>]* %coerce.dive to [6 x i64]*
+// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i64.v2i64(i64* [[TMP12]], <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
+// CHECK: ret void
+void test_vst1q_s64_x3(int64_t *a, int64x2x3_t b) {
+ vst1q_s64_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_s64_x4(
+// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <2 x i64>]* %coerce.dive to [8 x i64]*
+// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align [[QALIGN]]
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i64.v2i64(i64* [[TMP15]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]])
+// CHECK: ret void
+void test_vst1q_s64_x4(int64_t *a, int64x2x4_t b) {
+ vst1q_s64_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_s8_x2(
+// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <16 x i8>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
+// CHECK: ret void
+void test_vst1q_s8_x2(int8_t *a, int8x16x2_t b) {
+ vst1q_s8_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_s8_x3(
+// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <16 x i8>]* %coerce.dive to [6 x i64]*
+// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]])
+// CHECK: ret void
+void test_vst1q_s8_x3(int8_t *a, int8x16x3_t b) {
+ vst1q_s8_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_s8_x4(
+// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <16 x i8>]* %coerce.dive to [8 x i64]*
+// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align [[QALIGN]]
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
+// CHECK: ret void
+void test_vst1q_s8_x4(int8_t *a, int8x16x4_t b) {
+ vst1q_s8_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_u16_x2(
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i16>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v8i16(i16* [[TMP9]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
+// CHECK: ret void
+void test_vst1q_u16_x2(uint16_t *a, uint16x8x2_t b) {
+ vst1q_u16_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_u16_x3(
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i16>]* %coerce.dive to [6 x i64]*
+// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v8i16(i16* [[TMP12]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+// CHECK: ret void
+void test_vst1q_u16_x3(uint16_t *a, uint16x8x3_t b) {
+ vst1q_u16_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_u16_x4(
+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i16>]* %coerce.dive to [8 x i64]*
+// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align [[QALIGN]]
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v8i16(i16* [[TMP15]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
+// CHECK: ret void
+void test_vst1q_u16_x4(uint16_t *a, uint16x8x4_t b) {
+ vst1q_u16_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_u32_x2(
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x i32>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i32.v4i32(i32* [[TMP9]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
+// CHECK: ret void
+void test_vst1q_u32_x2(uint32_t *a, uint32x4x2_t b) {
+ vst1q_u32_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_u32_x3(
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x i32>]* %coerce.dive to [6 x i64]*
+// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i32.v4i32(i32* [[TMP12]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+// CHECK: ret void
+void test_vst1q_u32_x3(uint32_t *a, uint32x4x3_t b) {
+ vst1q_u32_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_u32_x4(
+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x i32>]* %coerce.dive to [8 x i64]*
+// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align [[QALIGN]]
+// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i32* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i32.v4i32(i32* [[TMP15]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
+// CHECK: ret void
+void test_vst1q_u32_x4(uint32_t *a, uint32x4x4_t b) {
+ vst1q_u32_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_u64_x2(
+// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <2 x i64>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i64.v2i64(i64* [[TMP9]], <2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
+// CHECK: ret void
+void test_vst1q_u64_x2(uint64_t *a, uint64x2x2_t b) {
+ vst1q_u64_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_u64_x3(
+// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <2 x i64>]* %coerce.dive to [6 x i64]*
+// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i64.v2i64(i64* [[TMP12]], <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
+// CHECK: ret void
+void test_vst1q_u64_x3(uint64_t *a, uint64x2x3_t b) {
+ vst1q_u64_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_u64_x4(
+// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <2 x i64>]* %coerce.dive to [8 x i64]*
+// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
+// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align [[QALIGN]]
+// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i64.v2i64(i64* [[TMP15]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]])
+// CHECK: ret void
+void test_vst1q_u64_x4(uint64_t *a, uint64x2x4_t b) {
+ vst1q_u64_x4(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_u8_x2(
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <16 x i8>]* %coerce.dive to [4 x i64]*
+// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
+// CHECK: ret void
+void test_vst1q_u8_x2(uint8_t *a, uint8x16x2_t b) {
+ vst1q_u8_x2(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_u8_x3(
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <16 x i8>]* %coerce.dive to [6 x i64]*
+// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]])
+// CHECK: ret void
+void test_vst1q_u8_x3(uint8_t *a, uint8x16x3_t b) {
+ vst1q_u8_x3(a, b);
+}
+
+// CHECK-LABEL: @test_vst1q_u8_x4(
+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align [[QALIGN]]
+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align [[QALIGN]]
+// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-A64: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <16 x i8>]* %coerce.dive to [8 x i64]*
+// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
+// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
+// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
+// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
+// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
+// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
+// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align [[QALIGN]]
+// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
+// CHECK: ret void
+void test_vst1q_u8_x4(uint8_t *a, uint8x16x4_t b) {
+ vst1q_u8_x4(a, b);
+}
diff --git a/test/CodeGen/arm-no-movt.c b/test/CodeGen/arm-no-movt.c
index f61f2248643e..bee1752b059c 100644
--- a/test/CodeGen/arm-no-movt.c
+++ b/test/CodeGen/arm-no-movt.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -triple thumbv7-apple-ios5 -target-feature +no-movt -emit-llvm -o - %s | FileCheck -check-prefix=NO-MOVT %s
// RUN: %clang_cc1 -triple thumbv7-apple-ios5 -emit-llvm -o - %s | FileCheck -check-prefix=MOVT %s
-// NO-MOVT: attributes #0 = { {{.*}} "target-features"="+no-movt,+thumb-mode"
-// MOVT-NOT: attributes #0 = { {{.*}} "target-features"="+no-movt,+thumb-mode"
+// NO-MOVT: attributes #0 = { {{.*}} "target-features"="+armv7-a,+no-movt,+thumb-mode"
+// MOVT-NOT: attributes #0 = { {{.*}} "target-features"="+armv7-a,+no-movt,+thumb-mode"
int foo1(int a) { return a; }
diff --git a/test/CodeGen/arm-swiftcall.c b/test/CodeGen/arm-swiftcall.c
index f5c33845e2f4..53109a3f681e 100644
--- a/test/CodeGen/arm-swiftcall.c
+++ b/test/CodeGen/arm-swiftcall.c
@@ -103,9 +103,7 @@ typedef struct {
TEST(struct_1);
// CHECK-LABEL: define {{.*}} @return_struct_1()
// CHECK: [[RET:%.*]] = alloca [[REC:%.*]], align 4
-// CHECK: [[VAR:%.*]] = alloca [[REC]], align 4
// CHECK: @llvm.memset
-// CHECK: @llvm.memcpy
// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ i32, i16, \[2 x i8\], float, float }]]*
// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
// CHECK: [[FIRST:%.*]] = load i32, i32* [[T0]], align 4
@@ -170,8 +168,6 @@ typedef struct {
TEST(struct_2);
// CHECK-LABEL: define {{.*}} @return_struct_2()
// CHECK: [[RET:%.*]] = alloca [[REC:%.*]], align 4
-// CHECK: [[VAR:%.*]] = alloca [[REC]], align 4
-// CHECK: @llvm.memcpy
// CHECK: @llvm.memcpy
// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ i32, i32, float, float }]]*
// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
@@ -240,9 +236,7 @@ typedef struct {
TEST(struct_misaligned_1)
// CHECK-LABEL: define {{.*}} @return_struct_misaligned_1()
// CHECK: [[RET:%.*]] = alloca [[REC:%.*]], align
-// CHECK: [[VAR:%.*]] = alloca [[REC]], align
// CHECK: @llvm.memset
-// CHECK: @llvm.memcpy
// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ i32, i8 }]]*
// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
// CHECK: [[FIRST:%.*]] = load i32, i32* [[T0]], align
@@ -282,8 +276,6 @@ typedef union {
TEST(union_het_fp)
// CHECK-LABEL: define {{.*}} @return_union_het_fp()
// CHECK: [[RET:%.*]] = alloca [[REC:%.*]], align {{(4|8)}}
-// CHECK: [[VAR:%.*]] = alloca [[REC]], align {{(4|8)}}
-// CHECK: @llvm.memcpy
// CHECK: @llvm.memcpy
// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ i32, i32 }]]*
// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
@@ -414,7 +406,6 @@ TEST(int4)
TEST(int8)
// CHECK-LABEL: define {{.*}} @return_int8()
// CHECK: [[RET:%.*]] = alloca [[REC:<8 x i32>]], align 32
-// CHECK: [[VAR:%.*]] = alloca [[REC]], align
// CHECK: store
// CHECK: load
// CHECK: store
@@ -458,7 +449,6 @@ TEST(int8)
TEST(int5)
// CHECK-LABEL: define {{.*}} @return_int5()
// CHECK: [[RET:%.*]] = alloca [[REC:<5 x i32>]], align 32
-// CHECK: [[VAR:%.*]] = alloca [[REC]], align
// CHECK: store
// CHECK: load
// CHECK: store
diff --git a/test/CodeGen/arm-target-features.c b/test/CodeGen/arm-target-features.c
index e5591a28ad1a..6bc56be4f4ca 100644
--- a/test/CodeGen/arm-target-features.c
+++ b/test/CodeGen/arm-target-features.c
@@ -1,22 +1,23 @@
// REQUIRES: arm-registered-target
// RUN: %clang_cc1 -triple thumbv7-linux-gnueabihf -target-cpu cortex-a8 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3
-// CHECK-VFP3: "target-features"="+dsp,+neon,+thumb-mode
+// CHECK-VFP3: "target-features"="+armv7-a,+dsp,+neon,+thumb-mode,+vfp3"
// RUN: %clang_cc1 -triple thumbv7-linux-gnueabihf -target-cpu cortex-a5 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4
-// CHECK-VFP4: "target-features"="+dsp,+neon,+thumb-mode,+vfp4"
+// CHECK-VFP4: "target-features"="+armv7-a,+dsp,+neon,+thumb-mode,+vfp4"
// RUN: %clang_cc1 -triple thumbv7-linux-gnueabihf -target-cpu cortex-a7 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV
// RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-a12 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV
-// RUN: %clang_cc1 -triple thumbv7s-linux-gnueabi -target-cpu swift -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV
+// RUN: %clang_cc1 -triple thumbv7s-linux-gnueabi -target-cpu swift -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV-2
// RUN: %clang_cc1 -triple thumbv7-linux-gnueabihf -target-cpu krait -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV
-// CHECK-VFP4-DIV: "target-features"="+dsp,+hwdiv,+hwdiv-arm,+neon,+thumb-mode,+vfp4"
+// CHECK-VFP4-DIV: "target-features"="+armv7-a,+dsp,+hwdiv,+hwdiv-arm,+neon,+thumb-mode,+vfp4"
+// CHECK-VFP4-DIV-2: "target-features"="+armv7s,+dsp,+hwdiv,+hwdiv-arm,+neon,+thumb-mode,+vfp4"
// RUN: %clang_cc1 -triple armv7-linux-gnueabihf -target-cpu cortex-a15 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV-ARM
// RUN: %clang_cc1 -triple armv7-linux-gnueabihf -target-cpu cortex-a17 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV-ARM
-// CHECK-VFP4-DIV-ARM: "target-features"="+dsp,+hwdiv,+hwdiv-arm,+neon,+vfp4,-thumb-mode"
+// CHECK-VFP4-DIV-ARM: "target-features"="+armv7-a,+dsp,+hwdiv,+hwdiv-arm,+neon,+vfp4,-thumb-mode"
// RUN: %clang_cc1 -triple thumbv7s-apple-ios7.0 -target-cpu cyclone -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
@@ -27,37 +28,82 @@
// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu exynos-m1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu exynos-m2 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu exynos-m3 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
-// CHECK-BASIC-V8: "target-features"="+crc,+crypto,+dsp,+fp-armv8,+hwdiv,+hwdiv-arm,+neon,+thumb-mode"
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu exynos-m4 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
+// CHECK-BASIC-V8: "target-features"="+armv8-a,+crc,+crypto,+dsp,+fp-armv8,+hwdiv,+hwdiv-arm,+neon,+thumb-mode"
// RUN: %clang_cc1 -triple armv8-linux-gnueabi -target-cpu cortex-a53 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8-ARM
-// CHECK-BASIC-V8-ARM: "target-features"="+crc,+crypto,+dsp,+fp-armv8,+hwdiv,+hwdiv-arm,+neon,-thumb-mode"
+// CHECK-BASIC-V8-ARM: "target-features"="+armv8-a,+crc,+crypto,+dsp,+fp-armv8,+hwdiv,+hwdiv-arm,+neon,-thumb-mode"
// RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-r5 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3-D16-DIV
-// CHECK-VFP3-D16-DIV: "target-features"="+d16,+dsp,+hwdiv,+hwdiv-arm,+thumb-mode,+vfp3"
+// CHECK-VFP3-D16-DIV: "target-features"="+armv7-r,+d16,+dsp,+hwdiv,+hwdiv-arm,+thumb-mode,+vfp3"
// RUN: %clang_cc1 -triple armv7-linux-gnueabi -target-cpu cortex-r4f -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3-D16-THUMB-DIV
-// CHECK-VFP3-D16-THUMB-DIV: "target-features"="+d16,+dsp,+hwdiv,+vfp3,-thumb-mode"
+// CHECK-VFP3-D16-THUMB-DIV: "target-features"="+armv7-r,+d16,+dsp,+hwdiv,+vfp3,-thumb-mode"
// RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-r7 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3-D16-FP16-DIV
// RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-r8 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3-D16-FP16-DIV
-// CHECK-VFP3-D16-FP16-DIV: "target-features"="+d16,+dsp,+fp16,+hwdiv,+hwdiv-arm,+thumb-mode,+vfp3"
+// CHECK-VFP3-D16-FP16-DIV: "target-features"="+armv7-r,+d16,+dsp,+fp16,+hwdiv,+hwdiv-arm,+thumb-mode,+vfp3"
// RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-m4 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-D16-SP-THUMB-DIV
-// CHECK-VFP4-D16-SP-THUMB-DIV: "target-features"="+d16,+dsp,+fp-only-sp,+hwdiv,+thumb-mode,+vfp4"
+// CHECK-VFP4-D16-SP-THUMB-DIV: "target-features"="+armv7e-m,+d16,+dsp,+fp-only-sp,+hwdiv,+thumb-mode,+vfp4"
// RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-m7 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP5-D16-THUMB-DIV
-// CHECK-VFP5-D16-THUMB-DIV: "target-features"="+d16,+dsp,+fp-armv8,+hwdiv,+thumb-mode"
+// CHECK-VFP5-D16-THUMB-DIV: "target-features"="+armv7e-m,+d16,+dsp,+fp-armv8,+hwdiv,+thumb-mode"
// RUN: %clang_cc1 -triple armv7-linux-gnueabi -target-cpu cortex-r4 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-THUMB-DIV
-// CHECK-THUMB-DIV: "target-features"="+dsp,+hwdiv,-thumb-mode"
+// CHECK-THUMB-DIV: "target-features"="+armv7-r,+dsp,+hwdiv,-thumb-mode"
// RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-m3 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-THUMB-DIV-M3
-// CHECK-THUMB-DIV-M3: "target-features"="+hwdiv,+thumb-mode"
+// CHECK-THUMB-DIV-M3: "target-features"="+armv7-m,+hwdiv,+thumb-mode"
+// (The following test with no arch specified shouldn't happen; the driver
+// rewrites triples. Just make sure it does something sane.)
+// RUN: %clang_cc1 -triple arm-linux-gnueabi -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARM-NOARCH-LINUX
+// CHECK-ARM-NOARCH-LINUX: "target-features"="-thumb-mode"
+
+// RUN: %clang_cc1 -triple armv4-linux-gnueabi -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV4-LINUX
+// CHECK-ARMV4-LINUX: "target-features"="+armv4,-thumb-mode"
+
+// RUN: %clang_cc1 -triple armv4t-linux-gnueabi -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV4T-LINUX
+// CHECK-ARMV4T-LINUX: "target-features"="+armv4t,-thumb-mode"
+
+// RUN: %clang_cc1 -triple armv5t-linux-gnueabi -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV5T-LINUX
+// CHECK-ARMV5T-LINUX: "target-features"="+armv5t,-thumb-mode"
+
+// RUN: %clang_cc1 -triple armv6-linux-gnueabi -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV6-LINUX
+// CHECK-ARMV6-LINUX: "target-features"="+armv6,-thumb-mode"
+
+// RUN: %clang_cc1 -triple armv6k-linux-gnueabi -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV6K-LINUX
+// CHECK-ARMV6K-LINUX: "target-features"="+armv6k,-thumb-mode"
+
+// RUN: %clang_cc1 -triple arm-linux-gnueabi -target-cpu mpcorenovfp -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV6K-MPCORE-LINUX
+// CHECK-ARMV6K-MPCORE-LINUX: "target-features"="+armv6k,+dsp,-thumb-mode"
+
+// RUN: %clang_cc1 -triple armv6t2-linux-gnueabi -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV6T2-LINUX
+// CHECK-ARMV6T2-LINUX: "target-features"="+armv6t2,-thumb-mode"
+
+// RUN: %clang_cc1 -triple thumbv6m-linux-gnueabi -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV6M-LINUX
+// RUN: %clang_cc1 -triple thumb-linux-gnueabi -target-cpu cortex-m0 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV6M-LINUX
+// CHECK-ARMV6M-LINUX: "target-features"="+armv6-m,+thumb-mode"
+
+// RUN: %clang_cc1 -triple thumbv7m-linux-gnueabi -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV7M-LINUX
+// CHECK-ARMV7M-LINUX: "target-features"="+armv7-m,+thumb-mode"
+
+// RUN: %clang_cc1 -triple thumb-linux-gnueabi -target-cpu cortex-m3 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV7M-M3-LINUX
+// CHECK-ARMV7M-M3-LINUX: "target-features"="+armv7-m,+hwdiv,+thumb-mode"
+
+// RUN: %clang_cc1 -triple thumbv8m.base-linux-gnueabi -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV8M-LINUX
+// CHECK-ARMV8M-LINUX: "target-features"="+armv8-m.base,+thumb-mode"
+
+// RUN: %clang_cc1 -triple thumb-linux-gnueabi -target-cpu cortex-m23 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV8M-M23-LINUX
+// CHECK-ARMV8M-M23-LINUX: "target-features"="+armv8-m.base,+hwdiv,+thumb-mode"
+
+// RUN: %clang_cc1 -triple thumb-linux-gnueabi -target-cpu cortex-m33 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV8M-MAIN-LINUX
+// CHECK-ARMV8M-MAIN-LINUX: "target-features"="+armv8-m.main,+d16,+dsp,+fp-armv8,+fp-only-sp,+hwdiv,+thumb-mode"
void foo() {}
diff --git a/test/CodeGen/arm-thumb-mode-target-feature.c b/test/CodeGen/arm-thumb-mode-target-feature.c
index 5c41d0b16fda..2b86529dc634 100644
--- a/test/CodeGen/arm-thumb-mode-target-feature.c
+++ b/test/CodeGen/arm-thumb-mode-target-feature.c
@@ -17,8 +17,8 @@ void t3() {}
// THUMB: void @t1() [[ThumbAttr:#[0-7]]]
// THUMB: void @t2() [[NoThumbAttr:#[0-7]]]
// THUMB: void @t3() [[ThumbAttr:#[0-7]]]
-// THUMB: attributes [[ThumbAttr]] = { {{.*}} "target-features"="+thumb-mode"
-// THUMB: attributes [[NoThumbAttr]] = { {{.*}} "target-features"="-thumb-mode"
+// THUMB: attributes [[ThumbAttr]] = { {{.*}} "target-features"="+armv7-a,+thumb-mode"
+// THUMB: attributes [[NoThumbAttr]] = { {{.*}} "target-features"="+armv7-a,-thumb-mode"
//
// THUMB-CLANG: void @t1() [[ThumbAttr:#[0-7]]]
// THUMB-CLANG: void @t2() [[NoThumbAttr:#[0-7]]]
@@ -29,5 +29,5 @@ void t3() {}
// ARM: void @t1() [[NoThumbAtr:#[0-7]]]
// ARM: void @t2() [[NoThumbAttr:#[0-7]]]
// ARM: void @t3() [[ThumbAttr:#[0-7]]]
-// ARM: attributes [[NoThumbAttr]] = { {{.*}} "target-features"="-thumb-mode"
-// ARM: attributes [[ThumbAttr]] = { {{.*}} "target-features"="+thumb-mode"
+// ARM: attributes [[NoThumbAttr]] = { {{.*}} "target-features"="+armv7-a,-thumb-mode"
+// ARM: attributes [[ThumbAttr]] = { {{.*}} "target-features"="+armv7-a,+thumb-mode"
diff --git a/test/CodeGen/arm-v8.2a-neon-intrinsics.c b/test/CodeGen/arm-v8.2a-neon-intrinsics.c
new file mode 100644
index 000000000000..58d911d3ff70
--- /dev/null
+++ b/test/CodeGen/arm-v8.2a-neon-intrinsics.c
@@ -0,0 +1,989 @@
+// RUN: %clang_cc1 -triple armv8.2a-linux-gnu -target-abi apcs-gnu -target-feature +neon -target-feature +fullfp16 \
+// RUN: -fallow-half-arguments-and-returns -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg \
+// RUN: | FileCheck %s
+
+// REQUIRES: arm-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: test_vabs_f16
+// CHECK: [[ABS:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a)
+// CHECK: ret <4 x half> [[ABS]]
+float16x4_t test_vabs_f16(float16x4_t a) {
+ return vabs_f16(a);
+}
+
+// CHECK-LABEL: test_vabsq_f16
+// CHECK: [[ABS:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
+// CHECK: ret <8 x half> [[ABS]]
+float16x8_t test_vabsq_f16(float16x8_t a) {
+ return vabsq_f16(a);
+}
+
+// CHECK-LABEL: test_vceqz_f16
+// CHECK: [[TMP1:%.*]] = fcmp oeq <4 x half> %a, zeroinitializer
+// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
+uint16x4_t test_vceqz_f16(float16x4_t a) {
+ return vceqz_f16(a);
+}
+
+// CHECK-LABEL: test_vceqzq_f16
+// CHECK: [[TMP1:%.*]] = fcmp oeq <8 x half> %a, zeroinitializer
+// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
+uint16x8_t test_vceqzq_f16(float16x8_t a) {
+ return vceqzq_f16(a);
+}
+
+// CHECK-LABEL: test_vcgez_f16
+// CHECK: [[TMP1:%.*]] = fcmp oge <4 x half> %a, zeroinitializer
+// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
+uint16x4_t test_vcgez_f16(float16x4_t a) {
+ return vcgez_f16(a);
+}
+
+// CHECK-LABEL: test_vcgezq_f16
+// CHECK: [[TMP1:%.*]] = fcmp oge <8 x half> %a, zeroinitializer
+// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
+uint16x8_t test_vcgezq_f16(float16x8_t a) {
+ return vcgezq_f16(a);
+}
+
+// CHECK-LABEL: test_vcgtz_f16
+// CHECK: [[TMP1:%.*]] = fcmp ogt <4 x half> %a, zeroinitializer
+// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
+uint16x4_t test_vcgtz_f16(float16x4_t a) {
+ return vcgtz_f16(a);
+}
+
+// CHECK-LABEL: test_vcgtzq_f16
+// CHECK: [[TMP1:%.*]] = fcmp ogt <8 x half> %a, zeroinitializer
+// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
+uint16x8_t test_vcgtzq_f16(float16x8_t a) {
+ return vcgtzq_f16(a);
+}
+
+// CHECK-LABEL: test_vclez_f16
+// CHECK: [[TMP1:%.*]] = fcmp ole <4 x half> %a, zeroinitializer
+// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
+uint16x4_t test_vclez_f16(float16x4_t a) {
+ return vclez_f16(a);
+}
+
+// CHECK-LABEL: test_vclezq_f16
+// CHECK: [[TMP1:%.*]] = fcmp ole <8 x half> %a, zeroinitializer
+// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
+uint16x8_t test_vclezq_f16(float16x8_t a) {
+ return vclezq_f16(a);
+}
+
+// CHECK-LABEL: test_vcltz_f16
+// CHECK: [[TMP1:%.*]] = fcmp olt <4 x half> %a, zeroinitializer
+// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
+uint16x4_t test_vcltz_f16(float16x4_t a) {
+ return vcltz_f16(a);
+}
+
+// CHECK-LABEL: test_vcltzq_f16
+// CHECK: [[TMP1:%.*]] = fcmp olt <8 x half> %a, zeroinitializer
+// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
+uint16x8_t test_vcltzq_f16(float16x8_t a) {
+ return vcltzq_f16(a);
+}
+
+// CHECK-LABEL: test_vcvt_f16_s16
+// CHECK: [[VCVT:%.*]] = sitofp <4 x i16> %a to <4 x half>
+// CHECK: ret <4 x half> [[VCVT]]
+float16x4_t test_vcvt_f16_s16 (int16x4_t a) {
+ return vcvt_f16_s16(a);
+}
+
+// CHECK-LABEL: test_vcvtq_f16_s16
+// CHECK: [[VCVT:%.*]] = sitofp <8 x i16> %a to <8 x half>
+// CHECK: ret <8 x half> [[VCVT]]
+float16x8_t test_vcvtq_f16_s16 (int16x8_t a) {
+ return vcvtq_f16_s16(a);
+}
+
+// CHECK-LABEL: test_vcvt_f16_u16
+// CHECK: [[VCVT:%.*]] = uitofp <4 x i16> %a to <4 x half>
+// CHECK: ret <4 x half> [[VCVT]]
+float16x4_t test_vcvt_f16_u16 (uint16x4_t a) {
+ return vcvt_f16_u16(a);
+}
+
+// CHECK-LABEL: test_vcvtq_f16_u16
+// CHECK: [[VCVT:%.*]] = uitofp <8 x i16> %a to <8 x half>
+// CHECK: ret <8 x half> [[VCVT]]
+float16x8_t test_vcvtq_f16_u16 (uint16x8_t a) {
+ return vcvtq_f16_u16(a);
+}
+
+// CHECK-LABEL: test_vcvt_s16_f16
+// CHECK: [[VCVT:%.*]] = fptosi <4 x half> %a to <4 x i16>
+// CHECK: ret <4 x i16> [[VCVT]]
+int16x4_t test_vcvt_s16_f16 (float16x4_t a) {
+ return vcvt_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtq_s16_f16
+// CHECK: [[VCVT:%.*]] = fptosi <8 x half> %a to <8 x i16>
+// CHECK: ret <8 x i16> [[VCVT]]
+int16x8_t test_vcvtq_s16_f16 (float16x8_t a) {
+ return vcvtq_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvt_u16_f16
+// CHECK: [[VCVT:%.*]] = fptoui <4 x half> %a to <4 x i16>
+// CHECK: ret <4 x i16> [[VCVT]]
+uint16x4_t test_vcvt_u16_f16 (float16x4_t a) {
+ return vcvt_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtq_u16_f16
+// CHECK: [[VCVT:%.*]] = fptoui <8 x half> %a to <8 x i16>
+// CHECK: ret <8 x i16> [[VCVT]]
+uint16x8_t test_vcvtq_u16_f16 (float16x8_t a) {
+ return vcvtq_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvta_s16_f16
+// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtas.v4i16.v4f16(<4 x half> %a)
+// CHECK: ret <4 x i16> [[VCVT]]
+int16x4_t test_vcvta_s16_f16 (float16x4_t a) {
+ return vcvta_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvta_u16_f16
+// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtau.v4i16.v4f16(<4 x half> %a)
+// CHECK: ret <4 x i16> [[VCVT]]
+uint16x4_t test_vcvta_u16_f16 (float16x4_t a) {
+ return vcvta_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtaq_s16_f16
+// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtas.v8i16.v8f16(<8 x half> %a)
+// CHECK: ret <8 x i16> [[VCVT]]
+int16x8_t test_vcvtaq_s16_f16 (float16x8_t a) {
+ return vcvtaq_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtm_s16_f16
+// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtms.v4i16.v4f16(<4 x half> %a)
+// CHECK: ret <4 x i16> [[VCVT]]
+int16x4_t test_vcvtm_s16_f16 (float16x4_t a) {
+ return vcvtm_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmq_s16_f16
+// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtms.v8i16.v8f16(<8 x half> %a)
+// CHECK: ret <8 x i16> [[VCVT]]
+int16x8_t test_vcvtmq_s16_f16 (float16x8_t a) {
+ return vcvtmq_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtm_u16_f16
+// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtmu.v4i16.v4f16(<4 x half> %a)
+// CHECK: ret <4 x i16> [[VCVT]]
+uint16x4_t test_vcvtm_u16_f16 (float16x4_t a) {
+ return vcvtm_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtmq_u16_f16
+// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtmu.v8i16.v8f16(<8 x half> %a)
+// CHECK: ret <8 x i16> [[VCVT]]
+uint16x8_t test_vcvtmq_u16_f16 (float16x8_t a) {
+ return vcvtmq_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtn_s16_f16
+// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtns.v4i16.v4f16(<4 x half> %a)
+// CHECK: ret <4 x i16> [[VCVT]]
+int16x4_t test_vcvtn_s16_f16 (float16x4_t a) {
+ return vcvtn_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnq_s16_f16
+// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtns.v8i16.v8f16(<8 x half> %a)
+// CHECK: ret <8 x i16> [[VCVT]]
+int16x8_t test_vcvtnq_s16_f16 (float16x8_t a) {
+ return vcvtnq_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtn_u16_f16
+// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtnu.v4i16.v4f16(<4 x half> %a)
+// CHECK: ret <4 x i16> [[VCVT]]
+uint16x4_t test_vcvtn_u16_f16 (float16x4_t a) {
+ return vcvtn_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtnq_u16_f16
+// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtnu.v8i16.v8f16(<8 x half> %a)
+// CHECK: ret <8 x i16> [[VCVT]]
+uint16x8_t test_vcvtnq_u16_f16 (float16x8_t a) {
+ return vcvtnq_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtp_s16_f16
+// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtps.v4i16.v4f16(<4 x half> %a)
+// CHECK: ret <4 x i16> [[VCVT]]
+int16x4_t test_vcvtp_s16_f16 (float16x4_t a) {
+ return vcvtp_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtpq_s16_f16
+// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtps.v8i16.v8f16(<8 x half> %a)
+// CHECK: ret <8 x i16> [[VCVT]]
+int16x8_t test_vcvtpq_s16_f16 (float16x8_t a) {
+ return vcvtpq_s16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtp_u16_f16
+// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtpu.v4i16.v4f16(<4 x half> %a)
+// CHECK: ret <4 x i16> [[VCVT]]
+uint16x4_t test_vcvtp_u16_f16 (float16x4_t a) {
+ return vcvtp_u16_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtpq_u16_f16
+// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtpu.v8i16.v8f16(<8 x half> %a)
+// CHECK: ret <8 x i16> [[VCVT]]
+uint16x8_t test_vcvtpq_u16_f16 (float16x8_t a) {
+ return vcvtpq_u16_f16(a);
+}
+
+// FIXME: Fix the zero constant when fp16 non-storage-only type becomes available.
+// CHECK-LABEL: test_vneg_f16
+// CHECK: [[NEG:%.*]] = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %a
+// CHECK: ret <4 x half> [[NEG]]
+float16x4_t test_vneg_f16(float16x4_t a) {
+ return vneg_f16(a);
+}
+
+// CHECK-LABEL: test_vnegq_f16
+// CHECK: [[NEG:%.*]] = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %a
+// CHECK: ret <8 x half> [[NEG]]
+float16x8_t test_vnegq_f16(float16x8_t a) {
+ return vnegq_f16(a);
+}
+
+// CHECK-LABEL: test_vrecpe_f16
+// CHECK: [[RCP:%.*]] = call <4 x half> @llvm.arm.neon.vrecpe.v4f16(<4 x half> %a)
+// CHECK: ret <4 x half> [[RCP]]
+float16x4_t test_vrecpe_f16(float16x4_t a) {
+ return vrecpe_f16(a);
+}
+
+// CHECK-LABEL: test_vrecpeq_f16
+// CHECK: [[RCP:%.*]] = call <8 x half> @llvm.arm.neon.vrecpe.v8f16(<8 x half> %a)
+// CHECK: ret <8 x half> [[RCP]]
+float16x8_t test_vrecpeq_f16(float16x8_t a) {
+ return vrecpeq_f16(a);
+}
+
+// CHECK-LABEL: test_vrnd_f16
+// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintz.v4f16(<4 x half> %a)
+// CHECK: ret <4 x half> [[RND]]
+float16x4_t test_vrnd_f16(float16x4_t a) {
+ return vrnd_f16(a);
+}
+
+// CHECK-LABEL: test_vrndq_f16
+// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintz.v8f16(<8 x half> %a)
+// CHECK: ret <8 x half> [[RND]]
+float16x8_t test_vrndq_f16(float16x8_t a) {
+ return vrndq_f16(a);
+}
+
+// CHECK-LABEL: test_vrnda_f16
+// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrinta.v4f16(<4 x half> %a)
+// CHECK: ret <4 x half> [[RND]]
+float16x4_t test_vrnda_f16(float16x4_t a) {
+ return vrnda_f16(a);
+}
+
+// CHECK-LABEL: test_vrndaq_f16
+// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrinta.v8f16(<8 x half> %a)
+// CHECK: ret <8 x half> [[RND]]
+float16x8_t test_vrndaq_f16(float16x8_t a) {
+ return vrndaq_f16(a);
+}
+
+// CHECK-LABEL: test_vrndm_f16
+// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintm.v4f16(<4 x half> %a)
+// CHECK: ret <4 x half> [[RND]]
+float16x4_t test_vrndm_f16(float16x4_t a) {
+ return vrndm_f16(a);
+}
+
+// CHECK-LABEL: test_vrndmq_f16
+// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintm.v8f16(<8 x half> %a)
+// CHECK: ret <8 x half> [[RND]]
+float16x8_t test_vrndmq_f16(float16x8_t a) {
+ return vrndmq_f16(a);
+}
+
+// CHECK-LABEL: test_vrndn_f16
+// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintn.v4f16(<4 x half> %a)
+// CHECK: ret <4 x half> [[RND]]
+float16x4_t test_vrndn_f16(float16x4_t a) {
+ return vrndn_f16(a);
+}
+
+// CHECK-LABEL: test_vrndnq_f16
+// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintn.v8f16(<8 x half> %a)
+// CHECK: ret <8 x half> [[RND]]
+float16x8_t test_vrndnq_f16(float16x8_t a) {
+ return vrndnq_f16(a);
+}
+
+// CHECK-LABEL: test_vrndp_f16
+// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintp.v4f16(<4 x half> %a)
+// CHECK: ret <4 x half> [[RND]]
+float16x4_t test_vrndp_f16(float16x4_t a) {
+ return vrndp_f16(a);
+}
+
+// CHECK-LABEL: test_vrndpq_f16
+// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintp.v8f16(<8 x half> %a)
+// CHECK: ret <8 x half> [[RND]]
+float16x8_t test_vrndpq_f16(float16x8_t a) {
+ return vrndpq_f16(a);
+}
+
+// CHECK-LABEL: test_vrndx_f16
+// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintx.v4f16(<4 x half> %a)
+// CHECK: ret <4 x half> [[RND]]
+float16x4_t test_vrndx_f16(float16x4_t a) {
+ return vrndx_f16(a);
+}
+
+// CHECK-LABEL: test_vrndxq_f16
+// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintx.v8f16(<8 x half> %a)
+// CHECK: ret <8 x half> [[RND]]
+float16x8_t test_vrndxq_f16(float16x8_t a) {
+ return vrndxq_f16(a);
+}
+
+// CHECK-LABEL: test_vrsqrte_f16
+// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrsqrte.v4f16(<4 x half> %a)
+// CHECK: ret <4 x half> [[RND]]
+float16x4_t test_vrsqrte_f16(float16x4_t a) {
+ return vrsqrte_f16(a);
+}
+
+// CHECK-LABEL: test_vrsqrteq_f16
+// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrsqrte.v8f16(<8 x half> %a)
+// CHECK: ret <8 x half> [[RND]]
+float16x8_t test_vrsqrteq_f16(float16x8_t a) {
+ return vrsqrteq_f16(a);
+}
+
+// CHECK-LABEL: test_vadd_f16
+// CHECK: [[ADD:%.*]] = fadd <4 x half> %a, %b
+// CHECK: ret <4 x half> [[ADD]]
+float16x4_t test_vadd_f16(float16x4_t a, float16x4_t b) {
+ return vadd_f16(a, b);
+}
+
+// CHECK-LABEL: test_vaddq_f16
+// CHECK: [[ADD:%.*]] = fadd <8 x half> %a, %b
+// CHECK: ret <8 x half> [[ADD]]
+float16x8_t test_vaddq_f16(float16x8_t a, float16x8_t b) {
+ return vaddq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vabd_f16
+// CHECK: [[ABD:%.*]] = call <4 x half> @llvm.arm.neon.vabds.v4f16(<4 x half> %a, <4 x half> %b)
+// CHECK: ret <4 x half> [[ABD]]
+float16x4_t test_vabd_f16(float16x4_t a, float16x4_t b) {
+ return vabd_f16(a, b);
+}
+
+// CHECK-LABEL: test_vabdq_f16
+// CHECK: [[ABD:%.*]] = call <8 x half> @llvm.arm.neon.vabds.v8f16(<8 x half> %a, <8 x half> %b)
+// CHECK: ret <8 x half> [[ABD]]
+float16x8_t test_vabdq_f16(float16x8_t a, float16x8_t b) {
+ return vabdq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcage_f16
+// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.arm.neon.vacge.v4i16.v4f16(<4 x half> %a, <4 x half> %b)
+// CHECK: ret <4 x i16> [[ABS]]
+uint16x4_t test_vcage_f16(float16x4_t a, float16x4_t b) {
+ return vcage_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcageq_f16
+// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.arm.neon.vacge.v8i16.v8f16(<8 x half> %a, <8 x half> %b)
+// CHECK: ret <8 x i16> [[ABS]]
+uint16x8_t test_vcageq_f16(float16x8_t a, float16x8_t b) {
+ return vcageq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcagt_f16
+// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.arm.neon.vacgt.v4i16.v4f16(<4 x half> %a, <4 x half> %b)
+// CHECK: ret <4 x i16> [[ABS]]
+uint16x4_t test_vcagt_f16(float16x4_t a, float16x4_t b) {
+ return vcagt_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcagtq_f16
+// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.arm.neon.vacgt.v8i16.v8f16(<8 x half> %a, <8 x half> %b)
+// CHECK: ret <8 x i16> [[ABS]]
+uint16x8_t test_vcagtq_f16(float16x8_t a, float16x8_t b) {
+ return vcagtq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcale_f16
+// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.arm.neon.vacge.v4i16.v4f16(<4 x half> %b, <4 x half> %a)
+// CHECK: ret <4 x i16> [[ABS]]
+uint16x4_t test_vcale_f16(float16x4_t a, float16x4_t b) {
+ return vcale_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcaleq_f16
+// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.arm.neon.vacge.v8i16.v8f16(<8 x half> %b, <8 x half> %a)
+// CHECK: ret <8 x i16> [[ABS]]
+uint16x8_t test_vcaleq_f16(float16x8_t a, float16x8_t b) {
+ return vcaleq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcalt_f16
+// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.arm.neon.vacgt.v4i16.v4f16(<4 x half> %b, <4 x half> %a)
+// CHECK: ret <4 x i16> [[ABS]]
+uint16x4_t test_vcalt_f16(float16x4_t a, float16x4_t b) {
+ return vcalt_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcaltq_f16
+// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.arm.neon.vacgt.v8i16.v8f16(<8 x half> %b, <8 x half> %a)
+// CHECK: ret <8 x i16> [[ABS]]
+uint16x8_t test_vcaltq_f16(float16x8_t a, float16x8_t b) {
+ return vcaltq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vceq_f16
+// CHECK: [[TMP1:%.*]] = fcmp oeq <4 x half> %a, %b
+// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
+uint16x4_t test_vceq_f16(float16x4_t a, float16x4_t b) {
+ return vceq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vceqq_f16
+// CHECK: [[TMP1:%.*]] = fcmp oeq <8 x half> %a, %b
+// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
+uint16x8_t test_vceqq_f16(float16x8_t a, float16x8_t b) {
+ return vceqq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcge_f16
+// CHECK: [[TMP1:%.*]] = fcmp oge <4 x half> %a, %b
+// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
+uint16x4_t test_vcge_f16(float16x4_t a, float16x4_t b) {
+ return vcge_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcgeq_f16
+// CHECK: [[TMP1:%.*]] = fcmp oge <8 x half> %a, %b
+// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
+uint16x8_t test_vcgeq_f16(float16x8_t a, float16x8_t b) {
+ return vcgeq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcgt_f16
+// CHECK: [[TMP1:%.*]] = fcmp ogt <4 x half> %a, %b
+// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
+uint16x4_t test_vcgt_f16(float16x4_t a, float16x4_t b) {
+ return vcgt_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcgtq_f16
+// CHECK: [[TMP1:%.*]] = fcmp ogt <8 x half> %a, %b
+// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
+uint16x8_t test_vcgtq_f16(float16x8_t a, float16x8_t b) {
+ return vcgtq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcle_f16
+// CHECK: [[TMP1:%.*]] = fcmp ole <4 x half> %a, %b
+// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
+uint16x4_t test_vcle_f16(float16x4_t a, float16x4_t b) {
+ return vcle_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcleq_f16
+// CHECK: [[TMP1:%.*]] = fcmp ole <8 x half> %a, %b
+// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
+uint16x8_t test_vcleq_f16(float16x8_t a, float16x8_t b) {
+ return vcleq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vclt_f16
+// CHECK: [[TMP1:%.*]] = fcmp olt <4 x half> %a, %b
+// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK: ret <4 x i16> [[TMP2]]
+uint16x4_t test_vclt_f16(float16x4_t a, float16x4_t b) {
+ return vclt_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcltq_f16
+// CHECK: [[TMP1:%.*]] = fcmp olt <8 x half> %a, %b
+// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK: ret <8 x i16> [[TMP2]]
+uint16x8_t test_vcltq_f16(float16x8_t a, float16x8_t b) {
+ return vcltq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vcvt_n_f16_s16
+// CHECK: [[CVT:%.*]] = call <4 x half> @llvm.arm.neon.vcvtfxs2fp.v4f16.v4i16(<4 x i16> %vcvt_n, i32 2)
+// CHECK: ret <4 x half> [[CVT]]
+float16x4_t test_vcvt_n_f16_s16(int16x4_t a) {
+ return vcvt_n_f16_s16(a, 2);
+}
+
+// CHECK-LABEL: test_vcvtq_n_f16_s16
+// CHECK: [[CVT:%.*]] = call <8 x half> @llvm.arm.neon.vcvtfxs2fp.v8f16.v8i16(<8 x i16> %vcvt_n, i32 2)
+// CHECK: ret <8 x half> [[CVT]]
+float16x8_t test_vcvtq_n_f16_s16(int16x8_t a) {
+ return vcvtq_n_f16_s16(a, 2);
+}
+
+// CHECK-LABEL: test_vcvt_n_f16_u16
+// CHECK: [[CVT:%.*]] = call <4 x half> @llvm.arm.neon.vcvtfxu2fp.v4f16.v4i16(<4 x i16> %vcvt_n, i32 2)
+// CHECK: ret <4 x half> [[CVT]]
+float16x4_t test_vcvt_n_f16_u16(uint16x4_t a) {
+ return vcvt_n_f16_u16(a, 2);
+}
+
+// CHECK-LABEL: test_vcvtq_n_f16_u16
+// CHECK: [[CVT:%.*]] = call <8 x half> @llvm.arm.neon.vcvtfxu2fp.v8f16.v8i16(<8 x i16> %vcvt_n, i32 2)
+// CHECK: ret <8 x half> [[CVT]]
+float16x8_t test_vcvtq_n_f16_u16(uint16x8_t a) {
+ return vcvtq_n_f16_u16(a, 2);
+}
+
+// CHECK-LABEL: test_vcvt_n_s16_f16
+// CHECK: [[CVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2fxs.v4i16.v4f16(<4 x half> %vcvt_n, i32 2)
+// CHECK: ret <4 x i16> [[CVT]]
+int16x4_t test_vcvt_n_s16_f16(float16x4_t a) {
+ return vcvt_n_s16_f16(a, 2);
+}
+
+// CHECK-LABEL: test_vcvtq_n_s16_f16
+// CHECK: [[CVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtfp2fxs.v8i16.v8f16(<8 x half> %vcvt_n, i32 2)
+// CHECK: ret <8 x i16> [[CVT]]
+int16x8_t test_vcvtq_n_s16_f16(float16x8_t a) {
+ return vcvtq_n_s16_f16(a, 2);
+}
+
+// CHECK-LABEL: test_vcvt_n_u16_f16
+// CHECK: [[CVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2fxu.v4i16.v4f16(<4 x half> %vcvt_n, i32 2)
+// CHECK: ret <4 x i16> [[CVT]]
+uint16x4_t test_vcvt_n_u16_f16(float16x4_t a) {
+ return vcvt_n_u16_f16(a, 2);
+}
+
+// CHECK-LABEL: test_vcvtq_n_u16_f16
+// CHECK: [[CVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtfp2fxu.v8i16.v8f16(<8 x half> %vcvt_n, i32 2)
+// CHECK: ret <8 x i16> [[CVT]]
+uint16x8_t test_vcvtq_n_u16_f16(float16x8_t a) {
+ return vcvtq_n_u16_f16(a, 2);
+}
+
+// CHECK-LABEL: test_vmax_f16
+// CHECK: [[MAX:%.*]] = call <4 x half> @llvm.arm.neon.vmaxs.v4f16(<4 x half> %a, <4 x half> %b)
+// CHECK: ret <4 x half> [[MAX]]
+float16x4_t test_vmax_f16(float16x4_t a, float16x4_t b) {
+ return vmax_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmaxq_f16
+// CHECK: [[MAX:%.*]] = call <8 x half> @llvm.arm.neon.vmaxs.v8f16(<8 x half> %a, <8 x half> %b)
+// CHECK: ret <8 x half> [[MAX]]
+float16x8_t test_vmaxq_f16(float16x8_t a, float16x8_t b) {
+ return vmaxq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmaxnm_f16
+// CHECK: [[MAX:%.*]] = call <4 x half> @llvm.arm.neon.vmaxnm.v4f16(<4 x half> %a, <4 x half> %b)
+// CHECK: ret <4 x half> [[MAX]]
+float16x4_t test_vmaxnm_f16(float16x4_t a, float16x4_t b) {
+ return vmaxnm_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmaxnmq_f16
+// CHECK: [[MAX:%.*]] = call <8 x half> @llvm.arm.neon.vmaxnm.v8f16(<8 x half> %a, <8 x half> %b)
+// CHECK: ret <8 x half> [[MAX]]
+float16x8_t test_vmaxnmq_f16(float16x8_t a, float16x8_t b) {
+ return vmaxnmq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmin_f16
+// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vmins.v4f16(<4 x half> %a, <4 x half> %b)
+// CHECK: ret <4 x half> [[MIN]]
+float16x4_t test_vmin_f16(float16x4_t a, float16x4_t b) {
+ return vmin_f16(a, b);
+}
+
+// CHECK-LABEL: test_vminq_f16
+// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.arm.neon.vmins.v8f16(<8 x half> %a, <8 x half> %b)
+// CHECK: ret <8 x half> [[MIN]]
+float16x8_t test_vminq_f16(float16x8_t a, float16x8_t b) {
+ return vminq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vminnm_f16
+// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vminnm.v4f16(<4 x half> %a, <4 x half> %b)
+// CHECK: ret <4 x half> [[MIN]]
+float16x4_t test_vminnm_f16(float16x4_t a, float16x4_t b) {
+ return vminnm_f16(a, b);
+}
+
+// CHECK-LABEL: test_vminnmq_f16
+// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.arm.neon.vminnm.v8f16(<8 x half> %a, <8 x half> %b)
+// CHECK: ret <8 x half> [[MIN]]
+float16x8_t test_vminnmq_f16(float16x8_t a, float16x8_t b) {
+ return vminnmq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmul_f16
+// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, %b
+// CHECK: ret <4 x half> [[MUL]]
+float16x4_t test_vmul_f16(float16x4_t a, float16x4_t b) {
+ return vmul_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmulq_f16
+// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, %b
+// CHECK: ret <8 x half> [[MUL]]
+float16x8_t test_vmulq_f16(float16x8_t a, float16x8_t b) {
+ return vmulq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vpadd_f16
+// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.arm.neon.vpadd.v4f16(<4 x half> %a, <4 x half> %b)
+// CHECK: ret <4 x half> [[ADD]]
+float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) {
+ return vpadd_f16(a, b);
+}
+
+// CHECK-LABEL: test_vpmax_f16
+// CHECK: [[MAX:%.*]] = call <4 x half> @llvm.arm.neon.vpmaxs.v4f16(<4 x half> %a, <4 x half> %b)
+// CHECK: ret <4 x half> [[MAX]]
+float16x4_t test_vpmax_f16(float16x4_t a, float16x4_t b) {
+ return vpmax_f16(a, b);
+}
+
+// CHECK-LABEL: test_vpmin_f16
+// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vpmins.v4f16(<4 x half> %a, <4 x half> %b)
+// CHECK: ret <4 x half> [[MIN]]
+float16x4_t test_vpmin_f16(float16x4_t a, float16x4_t b) {
+ return vpmin_f16(a, b);
+}
+
+// CHECK-LABEL: test_vrecps_f16
+// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vrecps.v4f16(<4 x half> %a, <4 x half> %b)
+// CHECK: ret <4 x half> [[MIN]]
+float16x4_t test_vrecps_f16(float16x4_t a, float16x4_t b) {
+ return vrecps_f16(a, b);
+}
+
+// CHECK-LABEL: test_vrecpsq_f16
+// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.arm.neon.vrecps.v8f16(<8 x half> %a, <8 x half> %b)
+// CHECK: ret <8 x half> [[MIN]]
+float16x8_t test_vrecpsq_f16(float16x8_t a, float16x8_t b) {
+ return vrecpsq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vrsqrts_f16
+// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vrsqrts.v4f16(<4 x half> %a, <4 x half> %b)
+// CHECK: ret <4 x half> [[MIN]]
+float16x4_t test_vrsqrts_f16(float16x4_t a, float16x4_t b) {
+ return vrsqrts_f16(a, b);
+}
+
+// CHECK-LABEL: test_vrsqrtsq_f16
+// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.arm.neon.vrsqrts.v8f16(<8 x half> %a, <8 x half> %b)
+// CHECK: ret <8 x half> [[MIN]]
+float16x8_t test_vrsqrtsq_f16(float16x8_t a, float16x8_t b) {
+ return vrsqrtsq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vsub_f16
+// CHECK: [[SUB:%.*]] = fsub <4 x half> %a, %b
+// CHECK: ret <4 x half> [[SUB]]
+float16x4_t test_vsub_f16(float16x4_t a, float16x4_t b) {
+ return vsub_f16(a, b);
+}
+
+// CHECK-LABEL: test_vsubq_f16
+// CHECK: [[SUB:%.*]] = fsub <8 x half> %a, %b
+// CHECK: ret <8 x half> [[SUB]]
+float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b) {
+ return vsubq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vfma_f16
+// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a)
+// CHECK: ret <4 x half> [[ADD]]
+float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
+ return vfma_f16(a, b, c);
+}
+
+// CHECK-LABEL: test_vfmaq_f16
+// CHECK: [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a)
+// CHECK: ret <8 x half> [[ADD]]
+float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
+ return vfmaq_f16(a, b, c);
+}
+
+// CHECK-LABEL: test_vfms_f16
+// CHECK: [[SUB:%.*]] = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
+// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[SUB]], <4 x half> %c, <4 x half> %a)
+// CHECK: ret <4 x half> [[ADD]]
+float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
+ return vfms_f16(a, b, c);
+}
+
+// CHECK-LABEL: test_vfmsq_f16
+// CHECK: [[SUB:%.*]] = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
+// CHECK: [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[SUB]], <8 x half> %c, <8 x half> %a)
+// CHECK: ret <8 x half> [[ADD]]
+float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
+ return vfmsq_f16(a, b, c);
+}
+
+// CHECK-LABEL: test_vmul_lane_f16
+// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP0]]
+// CHECK: ret <4 x half> [[MUL]]
+float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) {
+ return vmul_lane_f16(a, b, 3);
+}
+
+// CHECK-LABEL: test_vmulq_lane_f16
+// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP0]]
+// CHECK: ret <8 x half> [[MUL]]
+float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) {
+ return vmulq_lane_f16(a, b, 3);
+}
+
+// CHECK-LABEL: test_vmul_n_f16
+// CHECK: [[TMP0:%.*]] = insertelement <4 x half> undef, half [[b:%.*]], i32 0
+// CHECK: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[b]], i32 1
+// CHECK: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half [[b]], i32 2
+// CHECK: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half [[b]], i32 3
+// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP3]]
+// CHECK: ret <4 x half> [[MUL]]
+float16x4_t test_vmul_n_f16(float16x4_t a, float16_t b) {
+ return vmul_n_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmulq_n_f16
+// CHECK: [[TMP0:%.*]] = insertelement <8 x half> undef, half [[b:%.*]], i32 0
+// CHECK: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half [[b]], i32 1
+// CHECK: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[b]], i32 2
+// CHECK: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[b]], i32 3
+// CHECK: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[b]], i32 4
+// CHECK: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[b]], i32 5
+// CHECK: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[b]], i32 6
+// CHECK: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[b]], i32 7
+// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP7]]
+// CHECK: ret <8 x half> [[MUL]]
+float16x8_t test_vmulq_n_f16(float16x8_t a, float16_t b) {
+ return vmulq_n_f16(a, b);
+}
+
+// CHECK-LABEL: test_vbsl_f16
+// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
+// CHECK: [[VBSL:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL]] to <4 x half>
+// CHECK: ret <4 x half> [[TMP3]]
+float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) {
+ return vbsl_f16(a, b, c);
+}
+
+// CHECK-LABEL: test_vbslq_f16
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8>
+// CHECK: [[VBSL:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSL]] to <8 x half>
+// CHECK: ret <8 x half> [[TMP3]]
+float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) {
+ return vbslq_f16(a, b, c);
+}
+
+// CHECK-LABEL: test_vzip_f16
+// CHECK: [[VZIP0:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK: store <4 x half> [[VZIP0]], <4 x half>* [[addr1:%.*]]
+// CHECK: [[VZIP1:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK: store <4 x half> [[VZIP1]], <4 x half>* [[addr2:%.*]]
+float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
+ return vzip_f16(a, b);
+}
+
+// CHECK-LABEL: test_vzipq_f16
+// CHECK: [[VZIP0:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK: store <8 x half> [[VZIP0]], <8 x half>* [[addr1:%.*]]
+// CHECK: [[VZIP1:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK: store <8 x half> [[VZIP1]], <8 x half>* [[addr2:%.*]]
+float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
+ return vzipq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vuzp_f16
+// CHECK: [[VUZP0:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK: store <4 x half> [[VUZP0]], <4 x half>* [[addr1:%.*]]
+// CHECK: [[VUZP1:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK: store <4 x half> [[VUZP1]], <4 x half>* [[addr2:%.*]]
+float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
+ return vuzp_f16(a, b);
+}
+
+// CHECK-LABEL: test_vuzpq_f16
+// CHECK: [[VUZP0:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK: store <8 x half> [[VUZP0]], <8 x half>* [[addr1:%.*]]
+// CHECK: [[VUZP1:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK: store <8 x half> [[VUZP1]], <8 x half>* [[addr2:%.*]]
+float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
+ return vuzpq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vtrn_f16
+// CHECK: [[VTRN0:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK: store <4 x half> [[VTRN0]], <4 x half>* [[addr1:%.*]]
+// CHECK: [[VTRN1:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK: store <4 x half> [[VTRN1]], <4 x half>* [[addr2:%.*]]
+float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
+ return vtrn_f16(a, b);
+}
+
+// CHECK-LABEL: test_vtrnq_f16
+// CHECK: [[VTRN0:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK: store <8 x half> [[VTRN0]], <8 x half>* [[addr1:%.*]]
+// CHECK: [[VTRN1:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK: store <8 x half> [[VTRN1]], <8 x half>* [[addr2:%.*]]
+float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
+ return vtrnq_f16(a, b);
+}
+
+// CHECK-LABEL: test_vmov_n_f16
+// CHECK: [[TMP0:%.*]] = insertelement <4 x half> undef, half [[ARG:%.*]], i32 0
+// CHECK: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[ARG]], i32 1
+// CHECK: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half [[ARG]], i32 2
+// CHECK: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half [[ARG]], i32 3
+// CHECK: ret <4 x half> [[TMP3]]
+float16x4_t test_vmov_n_f16(float16_t a) {
+ return vmov_n_f16(a);
+}
+
+// CHECK-LABEL: test_vmovq_n_f16
+// CHECK: [[TMP0:%.*]] = insertelement <8 x half> undef, half [[ARG:%.*]], i32 0
+// CHECK: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half [[ARG]], i32 1
+// CHECK: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[ARG]], i32 2
+// CHECK: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[ARG]], i32 3
+// CHECK: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[ARG]], i32 4
+// CHECK: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[ARG]], i32 5
+// CHECK: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[ARG]], i32 6
+// CHECK: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[ARG]], i32 7
+// CHECK: ret <8 x half> [[TMP7]]
+float16x8_t test_vmovq_n_f16(float16_t a) {
+ return vmovq_n_f16(a);
+}
+
+// CHECK-LABEL: test_vdup_n_f16
+// CHECK: [[TMP0:%.*]] = insertelement <4 x half> undef, half [[ARG:%.*]], i32 0
+// CHECK: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[ARG]], i32 1
+// CHECK: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half [[ARG]], i32 2
+// CHECK: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half [[ARG]], i32 3
+// CHECK: ret <4 x half> [[TMP3]]
+float16x4_t test_vdup_n_f16(float16_t a) {
+ return vdup_n_f16(a);
+}
+
+// CHECK-LABEL: test_vdupq_n_f16
+// CHECK: [[TMP0:%.*]] = insertelement <8 x half> undef, half [[ARG:%.*]], i32 0
+// CHECK: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half [[ARG]], i32 1
+// CHECK: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[ARG]], i32 2
+// CHECK: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[ARG]], i32 3
+// CHECK: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[ARG]], i32 4
+// CHECK: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[ARG]], i32 5
+// CHECK: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[ARG]], i32 6
+// CHECK: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[ARG]], i32 7
+// CHECK: ret <8 x half> [[TMP7]]
+float16x8_t test_vdupq_n_f16(float16_t a) {
+ return vdupq_n_f16(a);
+}
+
+// CHECK-LABEL: test_vdup_lane_f16
+// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK: ret <4 x half> [[SHFL]]
+float16x4_t test_vdup_lane_f16(float16x4_t a) {
+ return vdup_lane_f16(a, 3);
+}
+
+// CHECK-LABEL: test_vdupq_lane_f16
+// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: ret <8 x half> [[SHFL]]
+float16x8_t test_vdupq_lane_f16(float16x4_t a) {
+ return vdupq_lane_f16(a, 3);
+}
+
+// CHECK-LABEL: @test_vext_f16(
+// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK: [[VEXT:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP3]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+// CHECK: ret <4 x half> [[VEXT]]
+float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
+ return vext_f16(a, b, 2);
+}
+
+// CHECK-LABEL: @test_vextq_f16(
+// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK: [[VEXT:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+// CHECK: ret <8 x half> [[VEXT]]
+float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) {
+ return vextq_f16(a, b, 5);
+}
+
+// CHECK-LABEL: @test_vrev64_f16(
+// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK: ret <4 x half> [[SHFL]]
+float16x4_t test_vrev64_f16(float16x4_t a) {
+ return vrev64_f16(a);
+}
+
+// CHECK-LABEL: @test_vrev64q_f16(
+// CHECK: [[SHFL:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK: ret <8 x half> [[SHFL]]
+float16x8_t test_vrev64q_f16(float16x8_t a) {
+ return vrev64q_f16(a);
+}
diff --git a/test/CodeGen/arm64-be-bitfield.c b/test/CodeGen/arm64-be-bitfield.c
index 081eab81e905..cee59b87099f 100644
--- a/test/CodeGen/arm64-be-bitfield.c
+++ b/test/CodeGen/arm64-be-bitfield.c
@@ -7,6 +7,6 @@ signed callee_b0f(struct bt3 bp11) {
// IR: callee_b0f(i64 [[ARG:%.*]])
// IR: store i64 [[ARG]], i64* [[PTR:%.*]], align 8
// IR: [[BITCAST:%.*]] = bitcast i64* [[PTR]] to i8*
-// IR: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* [[BITCAST]], i64 4
+// IR: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* align 8 [[BITCAST]], i64 4
return bp11.b2;
}
diff --git a/test/CodeGen/arm64-microsoft-arguments.cpp b/test/CodeGen/arm64-microsoft-arguments.cpp
new file mode 100644
index 000000000000..3ef468880ad7
--- /dev/null
+++ b/test/CodeGen/arm64-microsoft-arguments.cpp
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -triple aarch64-windows -ffreestanding -emit-llvm -O0 \
+// RUN: -x c++ -o - %s | FileCheck %s
+
+struct pod { int a, b, c, d, e; };
+
+struct non_pod {
+ int a;
+ non_pod() {}
+};
+
+struct pod s;
+struct non_pod t;
+
+struct pod bar() { return s; }
+struct non_pod foo() { return t; }
+// CHECK: define {{.*}} void @{{.*}}bar{{.*}}(%struct.pod* noalias sret %agg.result)
+// CHECK: define {{.*}} void @{{.*}}foo{{.*}}(%struct.non_pod* noalias %agg.result)
+
+
+// Check instance methods.
+struct pod2 { int x; };
+struct Baz { pod2 baz(); };
+
+int qux() { return Baz().baz().x; }
+// CHECK: declare {{.*}} void @{{.*}}baz@Baz{{.*}}(%struct.Baz*, %struct.pod2*)
diff --git a/test/CodeGen/arm64-microsoft-intrinsics.c b/test/CodeGen/arm64-microsoft-intrinsics.c
index ff802e7f9b85..2dbf1f9ea0f4 100644
--- a/test/CodeGen/arm64-microsoft-intrinsics.c
+++ b/test/CodeGen/arm64-microsoft-intrinsics.c
@@ -24,3 +24,38 @@ void check__isb(void) {
// CHECK-MSVC: @llvm.aarch64.isb(i32 0)
// CHECK-LINUX: error: implicit declaration of function '__isb'
+
+void check__yield(void) {
+ __yield();
+}
+
+// CHECK-MSVC: @llvm.aarch64.hint(i32 1)
+// CHECK-LINUX: error: implicit declaration of function '__yield'
+
+void check__wfe(void) {
+ __wfe();
+}
+
+// CHECK-MSVC: @llvm.aarch64.hint(i32 2)
+// CHECK-LINUX: error: implicit declaration of function '__wfe'
+
+void check__wfi(void) {
+ __wfi();
+}
+
+// CHECK-MSVC: @llvm.aarch64.hint(i32 3)
+// CHECK-LINUX: error: implicit declaration of function '__wfi'
+
+void check__sev(void) {
+ __sev();
+}
+
+// CHECK-MSVC: @llvm.aarch64.hint(i32 4)
+// CHECK-LINUX: error: implicit declaration of function '__sev'
+
+void check__sevl(void) {
+ __sevl();
+}
+
+// CHECK-MSVC: @llvm.aarch64.hint(i32 5)
+// CHECK-LINUX: error: implicit declaration of function '__sevl'
diff --git a/test/CodeGen/arm64-vrnd.c b/test/CodeGen/arm64-vrnd.c
index 2c1bb8f46a10..7729c094a20c 100644
--- a/test/CodeGen/arm64-vrnd.c
+++ b/test/CodeGen/arm64-vrnd.c
@@ -2,50 +2,21 @@
#include <arm_neon.h>
-int32x2_t rnd1(float32x2_t a) { return vrnd_f32(a); }
-// CHECK: call <2 x float> @llvm.trunc.v2f32(<2 x float>
-int32x4_t rnd3(float32x4_t a) { return vrndq_f32(a); }
-// CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float>
int64x2_t rnd5(float64x2_t a) { return vrndq_f64(a); }
// CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double>
-
-int32x2_t rnd7(float32x2_t a) { return vrndn_f32(a); }
-// CHECK: call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float>
-int32x4_t rnd8(float32x4_t a) { return vrndnq_f32(a); }
-// CHECK: call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float>
int64x2_t rnd9(float64x2_t a) { return vrndnq_f64(a); }
// CHECK: call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double>
-int64x2_t rnd10(float64x2_t a) { return vrndnq_f64(a); }
-// CHECK: call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double>
-int32x2_t rnd11(float32x2_t a) { return vrndm_f32(a); }
-// CHECK: call <2 x float> @llvm.floor.v2f32(<2 x float>
-int32x4_t rnd12(float32x4_t a) { return vrndmq_f32(a); }
-// CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float>
int64x2_t rnd13(float64x2_t a) { return vrndmq_f64(a); }
// CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double>
-int64x2_t rnd14(float64x2_t a) { return vrndmq_f64(a); }
-// CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double>
-int32x2_t rnd15(float32x2_t a) { return vrndp_f32(a); }
-// CHECK: call <2 x float> @llvm.ceil.v2f32(<2 x float>
-int32x4_t rnd16(float32x4_t a) { return vrndpq_f32(a); }
-// CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float>
int64x2_t rnd18(float64x2_t a) { return vrndpq_f64(a); }
// CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double>
-int32x2_t rnd19(float32x2_t a) { return vrnda_f32(a); }
-// CHECK: call <2 x float> @llvm.round.v2f32(<2 x float>
-int32x4_t rnd20(float32x4_t a) { return vrndaq_f32(a); }
-// CHECK: call <4 x float> @llvm.round.v4f32(<4 x float>
int64x2_t rnd22(float64x2_t a) { return vrndaq_f64(a); }
// CHECK: call <2 x double> @llvm.round.v2f64(<2 x double>
-int32x2_t rnd23(float32x2_t a) { return vrndx_f32(a); }
-// CHECK: call <2 x float> @llvm.rint.v2f32(<2 x float>
-int32x4_t rnd24(float32x4_t a) { return vrndxq_f32(a); }
-// CHECK: call <4 x float> @llvm.rint.v4f32(<4 x float>
int64x2_t rnd25(float64x2_t a) { return vrndxq_f64(a); }
// CHECK: call <2 x double> @llvm.rint.v2f64(<2 x double>
diff --git a/test/CodeGen/arm64_vdup.c b/test/CodeGen/arm64_vdup.c
index 8419828b38d2..67eb8553638a 100644
--- a/test/CodeGen/arm64_vdup.c
+++ b/test/CodeGen/arm64_vdup.c
@@ -30,7 +30,7 @@ float16x8_t test_vdupq_n_f16(float16_t *a1) {
// CHECK-LABEL: test_vdupq_n_f16
return vdupq_n_f16(*a1);
// match that an element is inserted into parts 0-7. The backend better
- // turn that into a single dup intruction
+ // turn that into a single dup instruction
// CHECK: insertelement {{.*, i32 0 *$}}
// CHECK: insertelement {{.*, i32 1 *$}}
// CHECK: insertelement {{.*, i32 2 *$}}
diff --git a/test/CodeGen/arm_neon_intrinsics.c b/test/CodeGen/arm_neon_intrinsics.c
index 62888dd73339..f6305062e8fd 100644
--- a/test/CodeGen/arm_neon_intrinsics.c
+++ b/test/CodeGen/arm_neon_intrinsics.c
@@ -4732,111 +4732,6 @@ poly16x4x2_t test_vld2_p16(poly16_t const * a) {
return vld2_p16(a);
}
-// CHECK-LABEL: @test_vld2_dup_u8(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>
-uint8x8x2_t test_vld2_dup_u8(uint8_t const * a) {
- return vld2_dup_u8(a);
-}
-
-// CHECK-LABEL: @test_vld2_dup_u16(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>
-uint16x4x2_t test_vld2_dup_u16(uint16_t const * a) {
- return vld2_dup_u16(a);
-}
-
-// CHECK-LABEL: @test_vld2_dup_u32(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>
-uint32x2x2_t test_vld2_dup_u32(uint32_t const * a) {
- return vld2_dup_u32(a);
-}
-
-// CHECK-LABEL: @test_vld2_dup_u64(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>
-uint64x1x2_t test_vld2_dup_u64(uint64_t const * a) {
- return vld2_dup_u64(a);
-}
-
-// CHECK-LABEL: @test_vld2_dup_s8(
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>
-int8x8x2_t test_vld2_dup_s8(int8_t const * a) {
- return vld2_dup_s8(a);
-}
-
-// CHECK-LABEL: @test_vld2_dup_s16(
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>
-int16x4x2_t test_vld2_dup_s16(int16_t const * a) {
- return vld2_dup_s16(a);
-}
-
-// CHECK-LABEL: @test_vld2_dup_s32(
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>
-int32x2x2_t test_vld2_dup_s32(int32_t const * a) {
- return vld2_dup_s32(a);
-}
-
-// CHECK-LABEL: @test_vld2_dup_s64(
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>
-int64x1x2_t test_vld2_dup_s64(int64_t const * a) {
- return vld2_dup_s64(a);
-}
-
-// CHECK-LABEL: @test_vld2_dup_f16(
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>
-float16x4x2_t test_vld2_dup_f16(float16_t const * a) {
- return vld2_dup_f16(a);
-}
-
-// CHECK-LABEL: @test_vld2_dup_f32(
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>
-float32x2x2_t test_vld2_dup_f32(float32_t const * a) {
- return vld2_dup_f32(a);
-}
-
-// CHECK-LABEL: @test_vld2_dup_p8(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>
-poly8x8x2_t test_vld2_dup_p8(poly8_t const * a) {
- return vld2_dup_p8(a);
-}
-
-// CHECK-LABEL: @test_vld2_dup_p16(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>
-poly16x4x2_t test_vld2_dup_p16(poly16_t const * a) {
- return vld2_dup_p16(a);
-}
-
// CHECK-LABEL: @test_vld2q_lane_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
@@ -4846,7 +4741,7 @@ poly16x4x2_t test_vld2_dup_p16(poly16_t const * a) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
@@ -4873,7 +4768,7 @@ uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
@@ -4900,7 +4795,7 @@ uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
@@ -4927,7 +4822,7 @@ int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
@@ -4954,7 +4849,7 @@ int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
@@ -4981,7 +4876,7 @@ float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
@@ -5008,7 +4903,7 @@ float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
@@ -5035,7 +4930,7 @@ poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -5057,7 +4952,7 @@ uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
@@ -5084,7 +4979,7 @@ uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
@@ -5111,7 +5006,7 @@ uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -5133,7 +5028,7 @@ int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
@@ -5160,7 +5055,7 @@ int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
@@ -5187,7 +5082,7 @@ int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
@@ -5214,7 +5109,7 @@ float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
@@ -5241,7 +5136,7 @@ float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -5263,7 +5158,7 @@ poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
@@ -5473,111 +5368,6 @@ poly16x4x3_t test_vld3_p16(poly16_t const * a) {
return vld3_p16(a);
}
-// CHECK-LABEL: @test_vld3_dup_u8(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
-uint8x8x3_t test_vld3_dup_u8(uint8_t const * a) {
- return vld3_dup_u8(a);
-}
-
-// CHECK-LABEL: @test_vld3_dup_u16(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
-uint16x4x3_t test_vld3_dup_u16(uint16_t const * a) {
- return vld3_dup_u16(a);
-}
-
-// CHECK-LABEL: @test_vld3_dup_u32(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
-uint32x2x3_t test_vld3_dup_u32(uint32_t const * a) {
- return vld3_dup_u32(a);
-}
-
-// CHECK-LABEL: @test_vld3_dup_u64(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>
-uint64x1x3_t test_vld3_dup_u64(uint64_t const * a) {
- return vld3_dup_u64(a);
-}
-
-// CHECK-LABEL: @test_vld3_dup_s8(
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
-int8x8x3_t test_vld3_dup_s8(int8_t const * a) {
- return vld3_dup_s8(a);
-}
-
-// CHECK-LABEL: @test_vld3_dup_s16(
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
-int16x4x3_t test_vld3_dup_s16(int16_t const * a) {
- return vld3_dup_s16(a);
-}
-
-// CHECK-LABEL: @test_vld3_dup_s32(
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
-int32x2x3_t test_vld3_dup_s32(int32_t const * a) {
- return vld3_dup_s32(a);
-}
-
-// CHECK-LABEL: @test_vld3_dup_s64(
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>
-int64x1x3_t test_vld3_dup_s64(int64_t const * a) {
- return vld3_dup_s64(a);
-}
-
-// CHECK-LABEL: @test_vld3_dup_f16(
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
-float16x4x3_t test_vld3_dup_f16(float16_t const * a) {
- return vld3_dup_f16(a);
-}
-
-// CHECK-LABEL: @test_vld3_dup_f32(
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>, <2 x float>
-float32x2x3_t test_vld3_dup_f32(float32_t const * a) {
- return vld3_dup_f32(a);
-}
-
-// CHECK-LABEL: @test_vld3_dup_p8(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
-poly8x8x3_t test_vld3_dup_p8(poly8_t const * a) {
- return vld3_dup_p8(a);
-}
-
-// CHECK-LABEL: @test_vld3_dup_p16(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
-poly16x4x3_t test_vld3_dup_p16(poly16_t const * a) {
- return vld3_dup_p16(a);
-}
-
// CHECK-LABEL: @test_vld3q_lane_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
@@ -5587,7 +5377,7 @@ poly16x4x3_t test_vld3_dup_p16(poly16_t const * a) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
@@ -5619,7 +5409,7 @@ uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
@@ -5651,7 +5441,7 @@ uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
@@ -5683,7 +5473,7 @@ int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
@@ -5715,7 +5505,7 @@ int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
@@ -5747,7 +5537,7 @@ float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
@@ -5779,7 +5569,7 @@ float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
@@ -5811,7 +5601,7 @@ poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -5836,7 +5626,7 @@ uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
@@ -5868,7 +5658,7 @@ uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
@@ -5900,7 +5690,7 @@ uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -5925,7 +5715,7 @@ int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
@@ -5957,7 +5747,7 @@ int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
@@ -5989,7 +5779,7 @@ int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
@@ -6021,7 +5811,7 @@ float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
@@ -6053,7 +5843,7 @@ float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -6078,7 +5868,7 @@ poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
@@ -6293,111 +6083,6 @@ poly16x4x4_t test_vld4_p16(poly16_t const * a) {
return vld4_p16(a);
}
-// CHECK-LABEL: @test_vld4_dup_u8(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
-uint8x8x4_t test_vld4_dup_u8(uint8_t const * a) {
- return vld4_dup_u8(a);
-}
-
-// CHECK-LABEL: @test_vld4_dup_u16(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
-uint16x4x4_t test_vld4_dup_u16(uint16_t const * a) {
- return vld4_dup_u16(a);
-}
-
-// CHECK-LABEL: @test_vld4_dup_u32(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
-uint32x2x4_t test_vld4_dup_u32(uint32_t const * a) {
- return vld4_dup_u32(a);
-}
-
-// CHECK-LABEL: @test_vld4_dup_u64(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>
-uint64x1x4_t test_vld4_dup_u64(uint64_t const * a) {
- return vld4_dup_u64(a);
-}
-
-// CHECK-LABEL: @test_vld4_dup_s8(
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
-int8x8x4_t test_vld4_dup_s8(int8_t const * a) {
- return vld4_dup_s8(a);
-}
-
-// CHECK-LABEL: @test_vld4_dup_s16(
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
-int16x4x4_t test_vld4_dup_s16(int16_t const * a) {
- return vld4_dup_s16(a);
-}
-
-// CHECK-LABEL: @test_vld4_dup_s32(
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
-int32x2x4_t test_vld4_dup_s32(int32_t const * a) {
- return vld4_dup_s32(a);
-}
-
-// CHECK-LABEL: @test_vld4_dup_s64(
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>
-int64x1x4_t test_vld4_dup_s64(int64_t const * a) {
- return vld4_dup_s64(a);
-}
-
-// CHECK-LABEL: @test_vld4_dup_f16(
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
-float16x4x4_t test_vld4_dup_f16(float16_t const * a) {
- return vld4_dup_f16(a);
-}
-
-// CHECK-LABEL: @test_vld4_dup_f32(
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float>
-float32x2x4_t test_vld4_dup_f32(float32_t const * a) {
- return vld4_dup_f32(a);
-}
-
-// CHECK-LABEL: @test_vld4_dup_p8(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
-poly8x8x4_t test_vld4_dup_p8(poly8_t const * a) {
- return vld4_dup_p8(a);
-}
-
-// CHECK-LABEL: @test_vld4_dup_p16(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
-poly16x4x4_t test_vld4_dup_p16(poly16_t const * a) {
- return vld4_dup_p16(a);
-}
-
// CHECK-LABEL: @test_vld4q_lane_u16(
// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
@@ -6407,7 +6092,7 @@ poly16x4x4_t test_vld4_dup_p16(poly16_t const * a) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
@@ -6444,7 +6129,7 @@ uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
@@ -6481,7 +6166,7 @@ uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
@@ -6518,7 +6203,7 @@ int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
@@ -6555,7 +6240,7 @@ int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
@@ -6592,7 +6277,7 @@ float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
@@ -6629,7 +6314,7 @@ float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
@@ -6666,7 +6351,7 @@ poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -6694,7 +6379,7 @@ uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
@@ -6731,7 +6416,7 @@ uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
@@ -6768,7 +6453,7 @@ uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -6796,7 +6481,7 @@ int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
@@ -6833,7 +6518,7 @@ int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
@@ -6870,7 +6555,7 @@ int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
@@ -6907,7 +6592,7 @@ float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
@@ -6944,7 +6629,7 @@ float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
@@ -6972,7 +6657,7 @@ poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
@@ -16199,7 +15884,7 @@ void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -16220,7 +15905,7 @@ void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -16246,7 +15931,7 @@ void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -16272,7 +15957,7 @@ void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -16293,7 +15978,7 @@ void test_vst2q_s8(int8_t * a, int8x16x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -16319,7 +16004,7 @@ void test_vst2q_s16(int16_t * a, int16x8x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -16345,7 +16030,7 @@ void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
@@ -16371,7 +16056,7 @@ void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
@@ -16397,7 +16082,7 @@ void test_vst2q_f32(float32_t * a, float32x4x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -16418,7 +16103,7 @@ void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -16444,7 +16129,7 @@ void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -16465,7 +16150,7 @@ void test_vst2_u8(uint8_t * a, uint8x8x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -16491,7 +16176,7 @@ void test_vst2_u16(uint16_t * a, uint16x4x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -16517,7 +16202,7 @@ void test_vst2_u32(uint32_t * a, uint32x2x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
@@ -16543,7 +16228,7 @@ void test_vst2_u64(uint64_t * a, uint64x1x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -16564,7 +16249,7 @@ void test_vst2_s8(int8_t * a, int8x8x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -16590,7 +16275,7 @@ void test_vst2_s16(int16_t * a, int16x4x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -16616,7 +16301,7 @@ void test_vst2_s32(int32_t * a, int32x2x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
@@ -16642,7 +16327,7 @@ void test_vst2_s64(int64_t * a, int64x1x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
@@ -16668,7 +16353,7 @@ void test_vst2_f16(float16_t * a, float16x4x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
@@ -16694,7 +16379,7 @@ void test_vst2_f32(float32_t * a, float32x2x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -16715,7 +16400,7 @@ void test_vst2_p8(poly8_t * a, poly8x8x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -16741,7 +16426,7 @@ void test_vst2_p16(poly16_t * a, poly16x4x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -16767,7 +16452,7 @@ void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -16793,7 +16478,7 @@ void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -16819,7 +16504,7 @@ void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -16845,7 +16530,7 @@ void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
@@ -16871,7 +16556,7 @@ void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
@@ -16897,7 +16582,7 @@ void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -16923,7 +16608,7 @@ void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -16944,7 +16629,7 @@ void test_vst2_lane_u8(uint8_t * a, uint8x8x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -16970,7 +16655,7 @@ void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -16996,7 +16681,7 @@ void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -17017,7 +16702,7 @@ void test_vst2_lane_s8(int8_t * a, int8x8x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -17043,7 +16728,7 @@ void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -17069,7 +16754,7 @@ void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
@@ -17095,7 +16780,7 @@ void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
@@ -17121,7 +16806,7 @@ void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -17142,7 +16827,7 @@ void test_vst2_lane_p8(poly8_t * a, poly8x8x2_t b) {
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -17168,7 +16853,7 @@ void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -17192,7 +16877,7 @@ void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -17223,7 +16908,7 @@ void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -17254,7 +16939,7 @@ void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -17278,7 +16963,7 @@ void test_vst3q_s8(int8_t * a, int8x16x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -17309,7 +16994,7 @@ void test_vst3q_s16(int16_t * a, int16x8x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -17340,7 +17025,7 @@ void test_vst3q_s32(int32_t * a, int32x4x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
@@ -17371,7 +17056,7 @@ void test_vst3q_f16(float16_t * a, float16x8x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
@@ -17402,7 +17087,7 @@ void test_vst3q_f32(float32_t * a, float32x4x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -17426,7 +17111,7 @@ void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -17457,7 +17142,7 @@ void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -17481,7 +17166,7 @@ void test_vst3_u8(uint8_t * a, uint8x8x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -17512,7 +17197,7 @@ void test_vst3_u16(uint16_t * a, uint16x4x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -17543,7 +17228,7 @@ void test_vst3_u32(uint32_t * a, uint32x2x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
@@ -17574,7 +17259,7 @@ void test_vst3_u64(uint64_t * a, uint64x1x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -17598,7 +17283,7 @@ void test_vst3_s8(int8_t * a, int8x8x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -17629,7 +17314,7 @@ void test_vst3_s16(int16_t * a, int16x4x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -17660,7 +17345,7 @@ void test_vst3_s32(int32_t * a, int32x2x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
@@ -17691,7 +17376,7 @@ void test_vst3_s64(int64_t * a, int64x1x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
@@ -17722,7 +17407,7 @@ void test_vst3_f16(float16_t * a, float16x4x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
@@ -17753,7 +17438,7 @@ void test_vst3_f32(float32_t * a, float32x2x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -17777,7 +17462,7 @@ void test_vst3_p8(poly8_t * a, poly8x8x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -17808,7 +17493,7 @@ void test_vst3_p16(poly16_t * a, poly16x4x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -17839,7 +17524,7 @@ void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -17870,7 +17555,7 @@ void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -17901,7 +17586,7 @@ void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -17932,7 +17617,7 @@ void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
@@ -17963,7 +17648,7 @@ void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
@@ -17994,7 +17679,7 @@ void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) {
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -18025,7 +17710,7 @@ void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -18049,7 +17734,7 @@ void test_vst3_lane_u8(uint8_t * a, uint8x8x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -18080,7 +17765,7 @@ void test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -18111,7 +17796,7 @@ void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -18135,7 +17820,7 @@ void test_vst3_lane_s8(int8_t * a, int8x8x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -18166,7 +17851,7 @@ void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -18197,7 +17882,7 @@ void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
@@ -18228,7 +17913,7 @@ void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
@@ -18259,7 +17944,7 @@ void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -18283,7 +17968,7 @@ void test_vst3_lane_p8(poly8_t * a, poly8x8x3_t b) {
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -18314,7 +17999,7 @@ void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -18341,7 +18026,7 @@ void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -18377,7 +18062,7 @@ void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -18413,7 +18098,7 @@ void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -18440,7 +18125,7 @@ void test_vst4q_s8(int8_t * a, int8x16x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -18476,7 +18161,7 @@ void test_vst4q_s16(int16_t * a, int16x8x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -18512,7 +18197,7 @@ void test_vst4q_s32(int32_t * a, int32x4x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
@@ -18548,7 +18233,7 @@ void test_vst4q_f16(float16_t * a, float16x8x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
@@ -18584,7 +18269,7 @@ void test_vst4q_f32(float32_t * a, float32x4x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
@@ -18611,7 +18296,7 @@ void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -18647,7 +18332,7 @@ void test_vst4q_p16(poly16_t * a, poly16x8x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -18674,7 +18359,7 @@ void test_vst4_u8(uint8_t * a, uint8x8x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -18710,7 +18395,7 @@ void test_vst4_u16(uint16_t * a, uint16x4x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -18746,7 +18431,7 @@ void test_vst4_u32(uint32_t * a, uint32x2x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
@@ -18782,7 +18467,7 @@ void test_vst4_u64(uint64_t * a, uint64x1x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -18809,7 +18494,7 @@ void test_vst4_s8(int8_t * a, int8x8x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -18845,7 +18530,7 @@ void test_vst4_s16(int16_t * a, int16x4x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -18881,7 +18566,7 @@ void test_vst4_s32(int32_t * a, int32x2x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
@@ -18917,7 +18602,7 @@ void test_vst4_s64(int64_t * a, int64x1x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
@@ -18953,7 +18638,7 @@ void test_vst4_f16(float16_t * a, float16x4x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
@@ -18989,7 +18674,7 @@ void test_vst4_f32(float32_t * a, float32x2x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -19016,7 +18701,7 @@ void test_vst4_p8(poly8_t * a, poly8x8x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -19052,7 +18737,7 @@ void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -19088,7 +18773,7 @@ void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -19124,7 +18809,7 @@ void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -19160,7 +18845,7 @@ void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
@@ -19196,7 +18881,7 @@ void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
@@ -19232,7 +18917,7 @@ void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
@@ -19268,7 +18953,7 @@ void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
@@ -19304,7 +18989,7 @@ void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -19331,7 +19016,7 @@ void test_vst4_lane_u8(uint8_t * a, uint8x8x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -19367,7 +19052,7 @@ void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -19403,7 +19088,7 @@ void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -19430,7 +19115,7 @@ void test_vst4_lane_s8(int8_t * a, int8x8x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -19466,7 +19151,7 @@ void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
@@ -19502,7 +19187,7 @@ void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
@@ -19538,7 +19223,7 @@ void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
@@ -19574,7 +19259,7 @@ void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
@@ -19601,7 +19286,7 @@ void test_vst4_lane_p8(poly8_t * a, poly8x8x4_t b) {
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
@@ -20463,331 +20148,259 @@ poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
return vtbx4_p8(a, b, c);
}
-// CHECK-LABEL: @test_vtrn_s8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !noalias !3
+// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !alias.scope !3
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !3
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !alias.scope !3
// CHECK: ret void
int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
return vtrn_s8(a, b);
}
-// CHECK-LABEL: @test_vtrn_s16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !noalias !6
+// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !alias.scope !6
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !6
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !alias.scope !6
// CHECK: ret void
int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
return vtrn_s16(a, b);
}
-// CHECK-LABEL: @test_vtrn_s32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]], !noalias !9
+// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]], !alias.scope !9
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !noalias !9
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !alias.scope !9
// CHECK: ret void
int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
return vtrn_s32(a, b);
}
-// CHECK-LABEL: @test_vtrn_u8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !noalias !12
+// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !alias.scope !12
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !12
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !alias.scope !12
// CHECK: ret void
uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
return vtrn_u8(a, b);
}
-// CHECK-LABEL: @test_vtrn_u16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !noalias !15
+// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !alias.scope !15
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !15
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !alias.scope !15
// CHECK: ret void
uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
return vtrn_u16(a, b);
}
-// CHECK-LABEL: @test_vtrn_u32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]], !noalias !18
+// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]], !alias.scope !18
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !noalias !18
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !alias.scope !18
// CHECK: ret void
uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
return vtrn_u32(a, b);
}
-// CHECK-LABEL: @test_vtrn_f32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]], !noalias !21
+// CHECK: store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]], !alias.scope !21
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP4]], !noalias !21
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP4]], !alias.scope !21
// CHECK: ret void
float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
return vtrn_f32(a, b);
}
-// CHECK-LABEL: @test_vtrn_p8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !noalias !24
+// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !alias.scope !24
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !24
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !alias.scope !24
// CHECK: ret void
poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
return vtrn_p8(a, b);
}
-// CHECK-LABEL: @test_vtrn_p16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrn_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !noalias !27
+// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !alias.scope !27
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !27
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !alias.scope !27
// CHECK: ret void
poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
return vtrn_p16(a, b);
}
-// CHECK-LABEL: @test_vtrnq_s8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !noalias !30
+// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !alias.scope !30
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !30
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !alias.scope !30
// CHECK: ret void
int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
return vtrnq_s8(a, b);
}
-// CHECK-LABEL: @test_vtrnq_s16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !noalias !33
+// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !alias.scope !33
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !33
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !alias.scope !33
// CHECK: ret void
int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
return vtrnq_s16(a, b);
}
-// CHECK-LABEL: @test_vtrnq_s32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]], !noalias !36
+// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]], !alias.scope !36
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !noalias !36
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !alias.scope !36
// CHECK: ret void
int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
return vtrnq_s32(a, b);
}
-// CHECK-LABEL: @test_vtrnq_u8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !noalias !39
+// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !alias.scope !39
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !39
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !alias.scope !39
// CHECK: ret void
uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
return vtrnq_u8(a, b);
}
-// CHECK-LABEL: @test_vtrnq_u16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !noalias !42
+// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !alias.scope !42
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !42
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !alias.scope !42
// CHECK: ret void
uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
return vtrnq_u16(a, b);
}
-// CHECK-LABEL: @test_vtrnq_u32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]], !noalias !45
+// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]], !alias.scope !45
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !noalias !45
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !alias.scope !45
// CHECK: ret void
uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
return vtrnq_u32(a, b);
}
-// CHECK-LABEL: @test_vtrnq_f32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]], !noalias !48
+// CHECK: store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]], !alias.scope !48
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]], !noalias !48
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]], !alias.scope !48
// CHECK: ret void
float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
return vtrnq_f32(a, b);
}
-// CHECK-LABEL: @test_vtrnq_p8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !noalias !51
+// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !alias.scope !51
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !51
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !alias.scope !51
// CHECK: ret void
poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
return vtrnq_p8(a, b);
}
-// CHECK-LABEL: @test_vtrnq_p16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vtrnq_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !noalias !54
+// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !alias.scope !54
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !54
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !alias.scope !54
// CHECK: ret void
poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
return vtrnq_p16(a, b);
@@ -20957,661 +20570,517 @@ uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
return vtstq_p16(a, b);
}
-// CHECK-LABEL: @test_vuzp_s8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !noalias !57
+// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !alias.scope !57
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !57
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !alias.scope !57
// CHECK: ret void
int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
return vuzp_s8(a, b);
}
-// CHECK-LABEL: @test_vuzp_s16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !noalias !60
+// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !alias.scope !60
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !60
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !alias.scope !60
// CHECK: ret void
int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
return vuzp_s16(a, b);
}
-// CHECK-LABEL: @test_vuzp_s32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]], !noalias !63
+// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]], !alias.scope !63
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !noalias !63
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !alias.scope !63
// CHECK: ret void
int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
return vuzp_s32(a, b);
}
-// CHECK-LABEL: @test_vuzp_u8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !noalias !66
+// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !alias.scope !66
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !66
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !alias.scope !66
// CHECK: ret void
uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
return vuzp_u8(a, b);
}
-// CHECK-LABEL: @test_vuzp_u16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !noalias !69
+// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !alias.scope !69
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !69
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !alias.scope !69
// CHECK: ret void
uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
return vuzp_u16(a, b);
}
-// CHECK-LABEL: @test_vuzp_u32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]], !noalias !72
+// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]], !alias.scope !72
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !noalias !72
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !alias.scope !72
// CHECK: ret void
uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
return vuzp_u32(a, b);
}
-// CHECK-LABEL: @test_vuzp_f32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]], !noalias !75
+// CHECK: store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]], !alias.scope !75
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP4]], !noalias !75
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP4]], !alias.scope !75
// CHECK: ret void
float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
return vuzp_f32(a, b);
}
-// CHECK-LABEL: @test_vuzp_p8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !noalias !78
+// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !alias.scope !78
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !78
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !alias.scope !78
// CHECK: ret void
poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
return vuzp_p8(a, b);
}
-// CHECK-LABEL: @test_vuzp_p16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzp_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !noalias !81
+// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !alias.scope !81
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !81
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !alias.scope !81
// CHECK: ret void
poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
return vuzp_p16(a, b);
}
-// CHECK-LABEL: @test_vuzpq_s8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !noalias !84
+// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !alias.scope !84
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !84
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !alias.scope !84
// CHECK: ret void
int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
return vuzpq_s8(a, b);
}
-// CHECK-LABEL: @test_vuzpq_s16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !noalias !87
+// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !alias.scope !87
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !87
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !alias.scope !87
// CHECK: ret void
int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
return vuzpq_s16(a, b);
}
-// CHECK-LABEL: @test_vuzpq_s32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]], !noalias !90
+// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]], !alias.scope !90
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !noalias !90
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !alias.scope !90
// CHECK: ret void
int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
return vuzpq_s32(a, b);
}
-// CHECK-LABEL: @test_vuzpq_u8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !noalias !93
+// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !alias.scope !93
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !93
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !alias.scope !93
// CHECK: ret void
uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
return vuzpq_u8(a, b);
}
-// CHECK-LABEL: @test_vuzpq_u16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !noalias !96
+// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !alias.scope !96
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !96
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !alias.scope !96
// CHECK: ret void
uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
return vuzpq_u16(a, b);
}
-// CHECK-LABEL: @test_vuzpq_u32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]], !noalias !99
+// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]], !alias.scope !99
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !noalias !99
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !alias.scope !99
// CHECK: ret void
uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
return vuzpq_u32(a, b);
}
-// CHECK-LABEL: @test_vuzpq_f32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]], !noalias !102
+// CHECK: store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]], !alias.scope !102
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]], !noalias !102
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]], !alias.scope !102
// CHECK: ret void
float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
return vuzpq_f32(a, b);
}
-// CHECK-LABEL: @test_vuzpq_p8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !noalias !105
+// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !alias.scope !105
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !105
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !alias.scope !105
// CHECK: ret void
poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
return vuzpq_p8(a, b);
}
-// CHECK-LABEL: @test_vuzpq_p16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vuzpq_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !noalias !108
+// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !alias.scope !108
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !108
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !alias.scope !108
// CHECK: ret void
poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
return vuzpq_p16(a, b);
}
-// CHECK-LABEL: @test_vzip_s8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !noalias !111
+// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !alias.scope !111
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !111
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !alias.scope !111
// CHECK: ret void
int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
return vzip_s8(a, b);
}
-// CHECK-LABEL: @test_vzip_s16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !noalias !114
+// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !alias.scope !114
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !114
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !alias.scope !114
// CHECK: ret void
int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
return vzip_s16(a, b);
}
-// CHECK-LABEL: @test_vzip_s32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]], !noalias !117
+// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]], !alias.scope !117
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !noalias !117
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !alias.scope !117
// CHECK: ret void
int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
return vzip_s32(a, b);
}
-// CHECK-LABEL: @test_vzip_u8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !noalias !120
+// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !alias.scope !120
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !120
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !alias.scope !120
// CHECK: ret void
uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
return vzip_u8(a, b);
}
-// CHECK-LABEL: @test_vzip_u16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !noalias !123
+// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !alias.scope !123
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !123
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !alias.scope !123
// CHECK: ret void
uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
return vzip_u16(a, b);
}
-// CHECK-LABEL: @test_vzip_u32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]], !noalias !126
+// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]], !alias.scope !126
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !noalias !126
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !alias.scope !126
// CHECK: ret void
uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
return vzip_u32(a, b);
}
-// CHECK-LABEL: @test_vzip_f32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]], !noalias !129
+// CHECK: store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]], !alias.scope !129
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP4]], !noalias !129
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP4]], !alias.scope !129
// CHECK: ret void
float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
return vzip_f32(a, b);
}
-// CHECK-LABEL: @test_vzip_p8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !noalias !132
+// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !alias.scope !132
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !132
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !alias.scope !132
// CHECK: ret void
poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
return vzip_p8(a, b);
}
-// CHECK-LABEL: @test_vzip_p16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzip_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !noalias !135
+// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !alias.scope !135
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !135
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !alias.scope !135
// CHECK: ret void
poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
return vzip_p16(a, b);
}
-// CHECK-LABEL: @test_vzipq_s8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !noalias !138
+// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !alias.scope !138
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !138
-// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !alias.scope !138
// CHECK: ret void
int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
return vzipq_s8(a, b);
}
-// CHECK-LABEL: @test_vzipq_s16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !noalias !141
+// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !alias.scope !141
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !141
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !alias.scope !141
// CHECK: ret void
int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
return vzipq_s16(a, b);
}
-// CHECK-LABEL: @test_vzipq_s32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]], !noalias !144
+// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]], !alias.scope !144
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !noalias !144
-// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !alias.scope !144
// CHECK: ret void
int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
return vzipq_s32(a, b);
}
-// CHECK-LABEL: @test_vzipq_u8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !noalias !147
+// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !alias.scope !147
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !147
-// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !alias.scope !147
// CHECK: ret void
uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
return vzipq_u8(a, b);
}
-// CHECK-LABEL: @test_vzipq_u16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !noalias !150
+// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !alias.scope !150
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !150
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !alias.scope !150
// CHECK: ret void
uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
return vzipq_u16(a, b);
}
-// CHECK-LABEL: @test_vzipq_u32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]], !noalias !153
+// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]], !alias.scope !153
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !noalias !153
-// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !alias.scope !153
// CHECK: ret void
uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
return vzipq_u32(a, b);
}
-// CHECK-LABEL: @test_vzipq_f32(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]], !noalias !156
+// CHECK: store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]], !alias.scope !156
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]], !noalias !156
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]], !alias.scope !156
// CHECK: ret void
float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
return vzipq_f32(a, b);
}
-// CHECK-LABEL: @test_vzipq_p8(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !noalias !159
+// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !alias.scope !159
// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !159
-// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
-// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !alias.scope !159
// CHECK: ret void
poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
return vzipq_p8(a, b);
}
-// CHECK-LABEL: @test_vzipq_p16(
-// CHECK: [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK: @test_vzipq_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
+// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[AGG_RESULT]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !noalias !162
+// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !alias.scope !162
// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !162
-// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !alias.scope !162
// CHECK: ret void
poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
  return vzipq_p16(a, b);
}
diff --git a/test/CodeGen/array-init.c b/test/CodeGen/array-init.c
new file mode 100644
index 000000000000..62e87edc2974
--- /dev/null
+++ b/test/CodeGen/array-init.c
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 %s -O0 -triple x86_64-unknown-linux-gnu -emit-llvm -o - | FileCheck -check-prefix=CHECK-NO-MERGE-CONSTANTS %s
+// RUN: %clang_cc1 %s -O0 -triple x86_64-unknown-linux-gnu -fmerge-all-constants -emit-llvm -o - | FileCheck -check-prefix=CHECK-MERGE-CONSTANTS %s
+
+// CHECK-NO-MERGE-CONSTANTS: @{{.*}}.a1 = private unnamed_addr constant [5 x i32] [i32 0, i32 1, i32 2, i32 0, i32 0]
+
+// CHECK-MERGE-CONSTANTS: @{{.*}}.a1 = internal constant [5 x i32] [i32 0, i32 1, i32 2, i32 0, i32 0]
+// CHECK-MERGE-CONSTANTS: @{{.*}}.a2 = internal constant [5 x i32] zeroinitializer
+// CHECK-MERGE-CONSTANTS: @{{.*}}.a3 = internal constant [5 x i32] zeroinitializer
+
+void testConstArrayInits(void)
+{
+ const int a1[5] = {0,1,2};
+ const int a2[5] = {0,0,0};
+ const int a3[5] = {0};
+}
diff --git a/test/CodeGen/artificial.c b/test/CodeGen/artificial.c
new file mode 100644
index 000000000000..5db6a6719454
--- /dev/null
+++ b/test/CodeGen/artificial.c
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -triple=x86_64-unknown-linux-gnu -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s
+
+extern void foo();
+// CHECK: !DISubprogram(name: "foo"
+// CHECK-SAME: flags: DIFlagArtificial
+inline void __attribute__((artificial)) foo() {}
+
+void baz() {
+ foo();
+}
diff --git a/test/CodeGen/asm-parser-info.S b/test/CodeGen/asm-parser-info.S
new file mode 100644
index 000000000000..62145c0a74d9
--- /dev/null
+++ b/test/CodeGen/asm-parser-info.S
@@ -0,0 +1,12 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang --target=x86_64-unknown-linux-gnu -c %s -o /dev/null
+
+// Check that cc1as can use assembler info in object generation.
+.data
+
+foo:
+.if . - foo == 0
+ .byte 0xaa
+.else
+ .byte 0x00
+.endif
diff --git a/test/CodeGen/atomic-arm64.c b/test/CodeGen/atomic-arm64.c
index 5cae3d134984..0e79846085cf 100644
--- a/test/CodeGen/atomic-arm64.c
+++ b/test/CodeGen/atomic-arm64.c
@@ -65,7 +65,7 @@ void test3(pointer_pair_t pair) {
// CHECK: [[TEMP:%.*]] = alloca [[QUAD_T:%.*]], align 8
// CHECK-NEXT: [[T0:%.*]] = bitcast [[QUAD_T]]* [[TEMP]] to i8*
// CHECK-NEXT: [[T1:%.*]] = bitcast [[QUAD_T]]* {{%.*}} to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 32, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[T0]], i8* align 8 [[T1]], i64 32, i1 false)
// CHECK-NEXT: [[T0:%.*]] = bitcast [[QUAD_T]]* [[TEMP]] to i256*
// CHECK-NEXT: [[T1:%.*]] = bitcast i256* [[T0]] to i8*
// CHECK-NEXT: call void @__atomic_store(i64 32, i8* bitcast ([[QUAD_T]]* @a_pointer_quad to i8*), i8* [[T1]], i32 5)
diff --git a/test/CodeGen/atomic-ops.c b/test/CodeGen/atomic-ops.c
index 1ebb2baa1ac6..500939f6fd75 100644
--- a/test/CodeGen/atomic-ops.c
+++ b/test/CodeGen/atomic-ops.c
@@ -183,11 +183,23 @@ struct S {
double x;
};
+void implicit_store(_Atomic(struct S) *a, struct S s) {
+ // CHECK-LABEL: @implicit_store(
+ // CHECK: store atomic i64 %{{.*}}, i64* %{{.*}} seq_cst, align 8
+ *a = s;
+}
+
+struct S implicit_load(_Atomic(struct S) *a) {
+ // CHECK-LABEL: @implicit_load(
+ // CHECK: load atomic i64, i64* %{{.*}} seq_cst, align 8
+ return *a;
+}
+
struct S fd1(struct S *a) {
// CHECK-LABEL: @fd1
// CHECK: [[RETVAL:%.*]] = alloca %struct.S, align 4
- // CHECK: [[RET:%.*]] = alloca %struct.S, align 4
- // CHECK: [[CAST:%.*]] = bitcast %struct.S* [[RET]] to i64*
+ // CHECK: bitcast %struct.S* {{.*}} to i64*
+ // CHECK: [[CAST:%.*]] = bitcast %struct.S* [[RETVAL]] to i64*
// CHECK: [[CALL:%.*]] = call i64 @__atomic_load_8(
// CHECK: store i64 [[CALL]], i64* [[CAST]], align 4
struct S ret;
diff --git a/test/CodeGen/atomics-sema-alignment.c b/test/CodeGen/atomics-sema-alignment.c
new file mode 100644
index 000000000000..364743dd53f0
--- /dev/null
+++ b/test/CodeGen/atomics-sema-alignment.c
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu %s -emit-llvm -o /dev/null -verify
+
+typedef struct {
+ int a, b;
+} IntPair;
+
+typedef struct {
+ long long a;
+} LongStruct;
+
+typedef int __attribute__((aligned(1))) unaligned_int;
+
+void func(IntPair *p) {
+ IntPair res;
+ __atomic_load(p, &res, 0); // expected-warning {{misaligned or large atomic operation may incur significant performance penalty}}
+ __atomic_store(p, &res, 0); // expected-warning {{misaligned or large atomic operation may incur significant performance penalty}}
+ __atomic_fetch_add((unaligned_int *)p, 1, 2); // expected-warning {{misaligned or large atomic operation may incur significant performance penalty}}
+ __atomic_fetch_sub((unaligned_int *)p, 1, 3); // expected-warning {{misaligned or large atomic operation may incur significant performance penalty}}
+}
+
+void func1(LongStruct *p) {
+ LongStruct res;
+ __atomic_load(p, &res, 0);
+ __atomic_store(p, &res, 0);
+ __atomic_fetch_add((int *)p, 1, 2);
+ __atomic_fetch_sub((int *)p, 1, 3);
+}
diff --git a/test/CodeGen/attr-cpuspecific.c b/test/CodeGen/attr-cpuspecific.c
new file mode 100644
index 000000000000..1b98b5dc9678
--- /dev/null
+++ b/test/CodeGen/attr-cpuspecific.c
@@ -0,0 +1,101 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+
+// Each called version should have an IFunc.
+// CHECK: @SingleVersion.ifunc = ifunc void (), void ()* ()* @SingleVersion.resolver
+// CHECK: @TwoVersions.ifunc = ifunc void (), void ()* ()* @TwoVersions.resolver
+// CHECK: @TwoVersionsSameAttr.ifunc = ifunc void (), void ()* ()* @TwoVersionsSameAttr.resolver
+// CHECK: @ThreeVersionsSameAttr.ifunc = ifunc void (), void ()* ()* @ThreeVersionsSameAttr.resolver
+
+__attribute__((cpu_specific(ivybridge)))
+void SingleVersion(void){}
+// CHECK: define void @SingleVersion.S() #[[S:[0-9]+]]
+
+__attribute__((cpu_specific(ivybridge)))
+void NotCalled(void){}
+// CHECK: define void @NotCalled.S() #[[S]]
+
+// Done before any of the implementations.
+__attribute__((cpu_dispatch(ivybridge, knl)))
+void TwoVersions(void);
+// CHECK: define void ()* @TwoVersions.resolver()
+// CHECK: call void @__cpu_indicator_init
+// CHECK: ret void ()* @TwoVersions.Z
+// CHECK: ret void ()* @TwoVersions.S
+// CHECK: call void @llvm.trap
+// CHECK: unreachable
+
+__attribute__((cpu_specific(ivybridge)))
+void TwoVersions(void){}
+// CHECK: define void @TwoVersions.S() #[[S]]
+
+__attribute__((cpu_specific(knl)))
+void TwoVersions(void){}
+// CHECK: define void @TwoVersions.Z() #[[K:[0-9]+]]
+
+__attribute__((cpu_specific(ivybridge, knl)))
+void TwoVersionsSameAttr(void){}
+// CHECK: define void @TwoVersionsSameAttr.S() #[[S]]
+// CHECK: define void @TwoVersionsSameAttr.Z() #[[K]]
+
+__attribute__((cpu_specific(atom, ivybridge, knl)))
+void ThreeVersionsSameAttr(void){}
+// CHECK: define void @ThreeVersionsSameAttr.O() #[[O:[0-9]+]]
+// CHECK: define void @ThreeVersionsSameAttr.S() #[[S]]
+// CHECK: define void @ThreeVersionsSameAttr.Z() #[[K]]
+
+void usages() {
+ SingleVersion();
+ // CHECK: @SingleVersion.ifunc()
+ TwoVersions();
+ // CHECK: @TwoVersions.ifunc()
+ TwoVersionsSameAttr();
+ // CHECK: @TwoVersionsSameAttr.ifunc()
+ ThreeVersionsSameAttr();
+ // CHECK: @ThreeVersionsSameAttr.ifunc()
+}
+
+// has an extra config to emit!
+__attribute__((cpu_dispatch(ivybridge, knl, atom)))
+void TwoVersionsSameAttr(void);
+// CHECK: define void ()* @TwoVersionsSameAttr.resolver()
+// CHECK: ret void ()* @TwoVersionsSameAttr.Z
+// CHECK: ret void ()* @TwoVersionsSameAttr.S
+// CHECK: ret void ()* @TwoVersionsSameAttr.O
+// CHECK: call void @llvm.trap
+// CHECK: unreachable
+
+__attribute__((cpu_dispatch(atom, ivybridge, knl)))
+void ThreeVersionsSameAttr(void){}
+// CHECK: define void ()* @ThreeVersionsSameAttr.resolver()
+// CHECK: call void @__cpu_indicator_init
+// CHECK: ret void ()* @ThreeVersionsSameAttr.Z
+// CHECK: ret void ()* @ThreeVersionsSameAttr.S
+// CHECK: ret void ()* @ThreeVersionsSameAttr.O
+// CHECK: call void @llvm.trap
+// CHECK: unreachable
+
+// No Cpu Specific options.
+__attribute__((cpu_dispatch(atom, ivybridge, knl)))
+void NoSpecifics(void);
+// CHECK: define void ()* @NoSpecifics.resolver()
+// CHECK: call void @__cpu_indicator_init
+// CHECK: ret void ()* @NoSpecifics.Z
+// CHECK: ret void ()* @NoSpecifics.S
+// CHECK: ret void ()* @NoSpecifics.O
+// CHECK: call void @llvm.trap
+// CHECK: unreachable
+
+__attribute__((cpu_dispatch(atom, generic, ivybridge, knl)))
+void HasGeneric(void);
+// CHECK: define void ()* @HasGeneric.resolver()
+// CHECK: call void @__cpu_indicator_init
+// CHECK: ret void ()* @HasGeneric.Z
+// CHECK: ret void ()* @HasGeneric.S
+// CHECK: ret void ()* @HasGeneric.O
+// CHECK: ret void ()* @HasGeneric.A
+// CHECK-NOT: call void @llvm.trap
+
+// CHECK: attributes #[[S]] = {{.*}}"target-features"="+avx,+cmov,+f16c,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave"
+// CHECK: attributes #[[K]] = {{.*}}"target-features"="+adx,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+cmov,+f16c,+fma,+lzcnt,+mmx,+movbe,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave"
+// CHECK: attributes #[[O]] = {{.*}}"target-features"="+cmov,+mmx,+movbe,+sse,+sse2,+sse3,+ssse3,+x87"
diff --git a/test/CodeGen/attr-target-mv-func-ptrs.c b/test/CodeGen/attr-target-mv-func-ptrs.c
new file mode 100644
index 000000000000..5df9a927cf8d
--- /dev/null
+++ b/test/CodeGen/attr-target-mv-func-ptrs.c
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+int __attribute__((target("sse4.2"))) foo(int i) { return 0; }
+int __attribute__((target("arch=sandybridge"))) foo(int);
+int __attribute__((target("arch=ivybridge"))) foo(int i) {return 1;}
+int __attribute__((target("default"))) foo(int i) { return 2; }
+
+typedef int (*FuncPtr)(int);
+void func(FuncPtr);
+
+int bar() {
+ func(foo);
+ FuncPtr Free = &foo;
+ FuncPtr Free2 = foo;
+
+ return 0;
+ return Free(1) + Free(2);
+}
+
+// CHECK: @foo.ifunc = ifunc i32 (i32), i32 (i32)* ()* @foo.resolver
+// CHECK: define i32 @foo.sse4.2(
+// CHECK: ret i32 0
+// CHECK: define i32 @foo.arch_ivybridge(
+// CHECK: ret i32 1
+// CHECK: define i32 @foo(
+// CHECK: ret i32 2
+
+// CHECK: define i32 @bar()
+// CHECK: call void @func(i32 (i32)* @foo.ifunc)
+// CHECK: store i32 (i32)* @foo.ifunc
+// CHECK: store i32 (i32)* @foo.ifunc
+
+// CHECK: declare i32 @foo.arch_sandybridge(
diff --git a/test/CodeGen/attr-target-mv-va-args.c b/test/CodeGen/attr-target-mv-va-args.c
new file mode 100644
index 000000000000..b33f841dba5e
--- /dev/null
+++ b/test/CodeGen/attr-target-mv-va-args.c
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+int __attribute__((target("sse4.2"))) foo(int i, ...) { return 0; }
+int __attribute__((target("arch=sandybridge"))) foo(int i, ...);
+int __attribute__((target("arch=ivybridge"))) foo(int i, ...) {return 1;}
+int __attribute__((target("default"))) foo(int i, ...) { return 2; }
+
+int bar() {
+ return foo(1, 'a', 1.1) + foo(2, 2.2, "asdf");
+}
+
+// CHECK: @foo.ifunc = ifunc i32 (i32, ...), i32 (i32, ...)* ()* @foo.resolver
+// CHECK: define i32 @foo.sse4.2(i32 %i, ...)
+// CHECK: ret i32 0
+// CHECK: define i32 @foo.arch_ivybridge(i32 %i, ...)
+// CHECK: ret i32 1
+// CHECK: define i32 @foo(i32 %i, ...)
+// CHECK: ret i32 2
+// CHECK: define i32 @bar()
+// CHECK: call i32 (i32, ...) @foo.ifunc(i32 1, i32 97, double
+// CHECK: call i32 (i32, ...) @foo.ifunc(i32 2, double 2.2{{[0-9Ee+]+}}, i8* getelementptr inbounds
+// CHECK: define i32 (i32, ...)* @foo.resolver() comdat
+// CHECK: ret i32 (i32, ...)* @foo.arch_sandybridge
+// CHECK: ret i32 (i32, ...)* @foo.arch_ivybridge
+// CHECK: ret i32 (i32, ...)* @foo.sse4.2
+// CHECK: ret i32 (i32, ...)* @foo
+// CHECK: declare i32 @foo.arch_sandybridge(i32, ...)
diff --git a/test/CodeGen/attr-target-mv.c b/test/CodeGen/attr-target-mv.c
new file mode 100644
index 000000000000..0085a154ced1
--- /dev/null
+++ b/test/CodeGen/attr-target-mv.c
@@ -0,0 +1,91 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+int __attribute__((target("sse4.2"))) foo(void) { return 0; }
+int __attribute__((target("arch=sandybridge"))) foo(void);
+int __attribute__((target("arch=ivybridge"))) foo(void) {return 1;}
+int __attribute__((target("default"))) foo(void) { return 2; }
+
+int bar() {
+ return foo();
+}
+
+inline int __attribute__((target("sse4.2"))) foo_inline(void) { return 0; }
+inline int __attribute__((target("arch=sandybridge"))) foo_inline(void);
+inline int __attribute__((target("arch=ivybridge"))) foo_inline(void) {return 1;}
+inline int __attribute__((target("default"))) foo_inline(void) { return 2; }
+
+int bar2() {
+ return foo_inline();
+}
+
+inline __attribute__((target("default"))) void foo_decls(void);
+inline __attribute__((target("sse4.2"))) void foo_decls(void);
+void bar3() {
+ foo_decls();
+}
+inline __attribute__((target("default"))) void foo_decls(void) {}
+inline __attribute__((target("sse4.2"))) void foo_decls(void) {}
+
+inline __attribute__((target("default"))) void foo_multi(void) {}
+inline __attribute__((target("avx,sse4.2"))) void foo_multi(void) {}
+inline __attribute__((target("sse4.2,fma4"))) void foo_multi(void) {}
+inline __attribute__((target("arch=ivybridge,fma4,sse4.2"))) void foo_multi(void) {}
+void bar4() {
+ foo_multi();
+}
+
+// CHECK: @foo.ifunc = ifunc i32 (), i32 ()* ()* @foo.resolver
+// CHECK: @foo_inline.ifunc = ifunc i32 (), i32 ()* ()* @foo_inline.resolver
+// CHECK: @foo_decls.ifunc = ifunc void (), void ()* ()* @foo_decls.resolver
+
+// CHECK: define i32 @foo.sse4.2()
+// CHECK: ret i32 0
+// CHECK: define i32 @foo.arch_ivybridge()
+// CHECK: ret i32 1
+// CHECK: define i32 @foo()
+// CHECK: ret i32 2
+// CHECK: define i32 @bar()
+// CHECK: call i32 @foo.ifunc()
+
+// CHECK: define i32 ()* @foo.resolver() comdat
+// CHECK: call void @__cpu_indicator_init()
+// CHECK: ret i32 ()* @foo.arch_sandybridge
+// CHECK: ret i32 ()* @foo.arch_ivybridge
+// CHECK: ret i32 ()* @foo.sse4.2
+// CHECK: ret i32 ()* @foo
+
+// CHECK: define i32 @bar2()
+// CHECK: call i32 @foo_inline.ifunc()
+
+// CHECK: define i32 ()* @foo_inline.resolver() comdat
+// CHECK: call void @__cpu_indicator_init()
+// CHECK: ret i32 ()* @foo_inline.arch_sandybridge
+// CHECK: ret i32 ()* @foo_inline.arch_ivybridge
+// CHECK: ret i32 ()* @foo_inline.sse4.2
+// CHECK: ret i32 ()* @foo_inline
+
+// CHECK: define void @bar3()
+// CHECK: call void @foo_decls.ifunc()
+
+// CHECK: define void ()* @foo_decls.resolver() comdat
+// CHECK: ret void ()* @foo_decls.sse4.2
+// CHECK: ret void ()* @foo_decls
+
+// CHECK: declare i32 @foo.arch_sandybridge()
+
+// CHECK: define available_externally i32 @foo_inline.sse4.2()
+// CHECK: ret i32 0
+
+// CHECK: declare i32 @foo_inline.arch_sandybridge()
+//
+// CHECK: define available_externally i32 @foo_inline.arch_ivybridge()
+// CHECK: ret i32 1
+// CHECK: define available_externally i32 @foo_inline()
+// CHECK: ret i32 2
+
+// CHECK: define available_externally void @foo_decls()
+// CHECK: define available_externally void @foo_decls.sse4.2()
+
+// CHECK: define available_externally void @foo_multi.avx_sse4.2()
+// CHECK: define available_externally void @foo_multi.fma4_sse4.2()
+// CHECK: define available_externally void @foo_multi.arch_ivybridge_fma4_sse4.2()
+
diff --git a/test/CodeGen/attr-target-x86.c b/test/CodeGen/attr-target-x86.c
index 9e46de74916b..ca544e4eaefb 100644
--- a/test/CodeGen/attr-target-x86.c
+++ b/test/CodeGen/attr-target-x86.c
@@ -21,6 +21,17 @@ int __attribute__((target("no-mmx"))) qq(int a) { return 40; }
int __attribute__((target("arch=lakemont,mmx"))) lake(int a) { return 4; }
+int use_before_def(void);
+int useage(void){
+ return use_before_def();
+}
+
+// Adding the attribute to a definition does update it in IR.
+int __attribute__((target("arch=lakemont,mmx"))) use_before_def(void) {
+ return 5;
+}
+
+
// Check that we emit the additional subtarget and cpu features for foo and not for baz or bar.
// CHECK: baz{{.*}} #0
// CHECK: foo{{.*}} #1
@@ -36,11 +47,12 @@ int __attribute__((target("arch=lakemont,mmx"))) lake(int a) { return 4; }
// CHECK: qax{{.*}} #5
// CHECK: qq{{.*}} #6
// CHECK: lake{{.*}} #7
+// CHECK: use_before_def{{.*}} #7
// CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+x87"
-// CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+aes,+avx,+cx16,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
+// CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+aes,+avx,+cx16,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+x87,-aes,-avx,-avx2,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vpopcntdq,-f16c,-fma,-fma4,-gfni,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-xop,-xsave,-xsaveopt"
// CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87"
// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+x87,-avx,-avx2,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vpopcntdq,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop,-xsave,-xsaveopt"
-// CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes"
+// CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes"
// CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+x87,-3dnow,-3dnowa,-mmx"
// CHECK: #7 = {{.*}}"target-cpu"="lakemont" "target-features"="+mmx"
diff --git a/test/CodeGen/attr-x86-interrupt.c b/test/CodeGen/attr-x86-interrupt.c
index 0aca1f5b9c9c..700a57524dca 100644
--- a/test/CodeGen/attr-x86-interrupt.c
+++ b/test/CodeGen/attr-x86-interrupt.c
@@ -23,12 +23,12 @@ __attribute__((interrupt)) void foo8(int *a) {}
// X86_LINUX: "disable-tail-calls"="true"
// X86_LINUX-NOT: "disable-tail-calls"="false"
// X86_64_WIN: @llvm.used = appending global [2 x i8*] [i8* bitcast (void (i32*, i64)* @foo7 to i8*), i8* bitcast (void (i32*)* @foo8 to i8*)], section "llvm.metadata"
-// X86_64_WIN: define x86_intrcc void @foo7(i32* %{{.+}}, i64 %{{.+}})
-// X86_64_WIN: define x86_intrcc void @foo8(i32* %{{.+}})
+// X86_64_WIN: define dso_local x86_intrcc void @foo7(i32* %{{.+}}, i64 %{{.+}})
+// X86_64_WIN: define dso_local x86_intrcc void @foo8(i32* %{{.+}})
// X86_64_Win: "disable-tail-calls"="true"
// X86_64_Win-NOT: "disable-tail-calls"="false"
// X86_WIN: @llvm.used = appending global [2 x i8*] [i8* bitcast (void (i32*, i32)* @foo7 to i8*), i8* bitcast (void (i32*)* @foo8 to i8*)], section "llvm.metadata"
-// X86_WIN: define x86_intrcc void @foo7(i32* %{{.+}}, i32 %{{.+}})
-// X86_WIN: define x86_intrcc void @foo8(i32* %{{.+}})
+// X86_WIN: define dso_local x86_intrcc void @foo7(i32* %{{.+}}, i32 %{{.+}})
+// X86_WIN: define dso_local x86_intrcc void @foo8(i32* %{{.+}})
// X86_Win: "disable-tail-calls"="true"
// X86_Win-NOT: "disable-tail-calls"="false"
diff --git a/test/CodeGen/attributes.c b/test/CodeGen/attributes.c
index 34833a246920..4be38b6367dc 100644
--- a/test/CodeGen/attributes.c
+++ b/test/CodeGen/attributes.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -emit-llvm -triple i386-linux-gnu -o %t %s
+// RUN: %clang_cc1 -emit-llvm -fcf-protection=branch -triple i386-linux-gnu -o %t %s
// RUN: FileCheck --input-file=%t %s
// CHECK: @t5 = weak global i32 2
@@ -97,8 +97,20 @@ void __attribute__((section(".bar"))) t22(void) {}
// CHECK: define void @t22() [[NUW]] section ".bar"
+// CHECK: define void @t23() [[NOCF_CHECK_FUNC:#[0-9]+]]
+void __attribute__((nocf_check)) t23(void) {}
+
+// CHECK: call void %{{[a-z0-9]+}}() [[NOCF_CHECK_CALL:#[0-9]+]]
+typedef void (*f_t)(void);
+void t24(f_t f1) {
+ __attribute__((nocf_check)) f_t p = f1;
+ (*p)();
+}
+
// CHECK: attributes [[NUW]] = { noinline nounwind{{.*}} }
// CHECK: attributes [[NR]] = { noinline noreturn nounwind{{.*}} }
// CHECK: attributes [[COLDDEF]] = { cold {{.*}}}
// CHECK: attributes [[COLDDECL]] = { cold {{.*}}}
+// CHECK: attributes [[NOCF_CHECK_FUNC]] = { nocf_check {{.*}}}
// CHECK: attributes [[COLDSITE]] = { cold {{.*}}}
+// CHECK: attributes [[NOCF_CHECK_CALL]] = { nocf_check }
diff --git a/test/CodeGen/avx-builtins.c b/test/CodeGen/avx-builtins.c
index 4e77ad166ce0..3e7709b1b7e9 100644
--- a/test/CodeGen/avx-builtins.c
+++ b/test/CodeGen/avx-builtins.c
@@ -2,7 +2,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -59,7 +59,7 @@ __m256 test_mm256_andnot_ps(__m256 A, __m256 B) {
__m256d test_mm256_blend_pd(__m256d A, __m256d B) {
// CHECK-LABEL: test_mm256_blend_pd
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
- return _mm256_blend_pd(A, B, 0x35);
+ return _mm256_blend_pd(A, B, 0x05);
}
__m256 test_mm256_blend_ps(__m256 A, __m256 B) {
@@ -212,28 +212,772 @@ __m256 test_mm_ceil_ps(__m256 x) {
return _mm256_ceil_ps(x);
}
-__m128d test_mm_cmp_pd(__m128d A, __m128d B) {
- // CHECK-LABEL: test_mm_cmp_pd
- // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 13)
- return _mm_cmp_pd(A, B, _CMP_GE_OS);
+__m256d test_mm256_cmp_pd_eq_oq(__m256d a, __m256d b) {
+ // CHECK-LABEL: @test_mm256_cmp_pd_eq_oq
+ // CHECK: fcmp oeq <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_EQ_OQ);
}
-__m256d test_mm256_cmp_pd(__m256d A, __m256d B) {
- // CHECK-LABEL: test_mm256_cmp_pd
- // CHECK: call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i8 13)
- return _mm256_cmp_pd(A, B, _CMP_GE_OS);
+__m256d test_mm256_cmp_pd_lt_os(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_lt_os
+ // CHECK: fcmp olt <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_LT_OS);
}
-__m128 test_mm_cmp_ps(__m128 A, __m128 B) {
- // CHECK-LABEL: test_mm_cmp_ps
- // CHECK: call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 13)
- return _mm_cmp_ps(A, B, _CMP_GE_OS);
+__m256d test_mm256_cmp_pd_le_os(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_le_os
+ // CHECK: fcmp ole <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_LE_OS);
}
-__m256 test_mm256_cmp_ps(__m256d A, __m256d B) {
- // CHECK-LABEL: test_mm256_cmp_ps
- // CHECK: call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i8 13)
- return _mm256_cmp_ps(A, B, _CMP_GE_OS);
+__m256d test_mm256_cmp_pd_unord_q(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_unord_q
+ // CHECK: fcmp uno <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_UNORD_Q);
+}
+
+__m256d test_mm256_cmp_pd_neq_uq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_neq_uq
+ // CHECK: fcmp une <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ);
+}
+
+__m256d test_mm256_cmp_pd_nlt_us(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_nlt_us
+ // CHECK: fcmp uge <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_NLT_US);
+}
+
+__m256d test_mm256_cmp_pd_nle_us(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_nle_us
+ // CHECK: fcmp ugt <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_NLE_US);
+}
+
+__m256d test_mm256_cmp_pd_ord_q(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_ord_q
+ // CHECK: fcmp ord <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_ORD_Q);
+}
+
+__m256d test_mm256_cmp_pd_eq_uq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_eq_uq
+ // CHECK: fcmp ueq <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_EQ_UQ);
+}
+
+__m256d test_mm256_cmp_pd_nge_us(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_nge_us
+ // CHECK: fcmp ult <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_NGE_US);
+}
+
+__m256d test_mm256_cmp_pd_ngt_us(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_ngt_us
+ // CHECK: fcmp ule <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_NGT_US);
+}
+
+__m256d test_mm256_cmp_pd_false_oq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_false_oq
+ // CHECK: fcmp false <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_FALSE_OQ);
+}
+
+__m256d test_mm256_cmp_pd_neq_oq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_neq_oq
+ // CHECK: fcmp one <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_NEQ_OQ);
+}
+
+__m256d test_mm256_cmp_pd_ge_os(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_ge_os
+ // CHECK: fcmp oge <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_GE_OS);
+}
+
+__m256d test_mm256_cmp_pd_gt_os(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_gt_os
+ // CHECK: fcmp ogt <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_GT_OS);
+}
+
+__m256d test_mm256_cmp_pd_true_uq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_true_uq
+ // CHECK: fcmp true <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_TRUE_UQ);
+}
+
+__m256d test_mm256_cmp_pd_eq_os(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_eq_os
+ // CHECK: fcmp oeq <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_EQ_OS);
+}
+
+__m256d test_mm256_cmp_pd_lt_oq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_lt_oq
+ // CHECK: fcmp olt <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
+}
+
+__m256d test_mm256_cmp_pd_le_oq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_le_oq
+ // CHECK: fcmp ole <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
+}
+
+__m256d test_mm256_cmp_pd_unord_s(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_unord_s
+ // CHECK: fcmp uno <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_UNORD_S);
+}
+
+__m256d test_mm256_cmp_pd_neq_us(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_neq_us
+ // CHECK: fcmp une <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_NEQ_US);
+}
+
+__m256d test_mm256_cmp_pd_nlt_uq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_nlt_uq
+ // CHECK: fcmp uge <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_NLT_UQ);
+}
+
+__m256d test_mm256_cmp_pd_nle_uq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_nle_uq
+ // CHECK: fcmp ugt <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_NLE_UQ);
+}
+
+__m256d test_mm256_cmp_pd_ord_s(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_ord_s
+ // CHECK: fcmp ord <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_ORD_S);
+}
+
+__m256d test_mm256_cmp_pd_eq_us(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_eq_us
+ // CHECK: fcmp ueq <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_EQ_US);
+}
+
+__m256d test_mm256_cmp_pd_nge_uq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_nge_uq
+ // CHECK: fcmp ult <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_NGE_UQ);
+}
+
+__m256d test_mm256_cmp_pd_ngt_uq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_ngt_uq
+ // CHECK: fcmp ule <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_NGT_UQ);
+}
+
+__m256d test_mm256_cmp_pd_false_os(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_false_os
+ // CHECK: fcmp false <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_FALSE_OS);
+}
+
+__m256d test_mm256_cmp_pd_neq_os(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_neq_os
+ // CHECK: fcmp one <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_NEQ_OS);
+}
+
+__m256d test_mm256_cmp_pd_ge_oq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_ge_oq
+ // CHECK: fcmp oge <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_GE_OQ);
+}
+
+__m256d test_mm256_cmp_pd_gt_oq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_gt_oq
+ // CHECK: fcmp ogt <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_GT_OQ);
+}
+
+__m256d test_mm256_cmp_pd_true_us(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_true_us
+ // CHECK: fcmp true <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd(a, b, _CMP_TRUE_US);
+}
+
+__m256 test_mm256_cmp_ps_eq_oq(__m256 a, __m256 b) {
+ // CHECK-LABEL: @test_mm256_cmp_ps_eq_oq
+ // CHECK: fcmp oeq <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);
+}
+
+__m256 test_mm256_cmp_ps_lt_os(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_lt_os
+ // CHECK: fcmp olt <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_LT_OS);
+}
+
+__m256 test_mm256_cmp_ps_le_os(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_le_os
+ // CHECK: fcmp ole <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_LE_OS);
+}
+
+__m256 test_mm256_cmp_ps_unord_q(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_unord_q
+ // CHECK: fcmp uno <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_UNORD_Q);
+}
+
+__m256 test_mm256_cmp_ps_neq_uq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_neq_uq
+ // CHECK: fcmp une <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ);
+}
+
+__m256 test_mm256_cmp_ps_nlt_us(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_nlt_us
+ // CHECK: fcmp uge <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_NLT_US);
+}
+
+__m256 test_mm256_cmp_ps_nle_us(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_nle_us
+ // CHECK: fcmp ugt <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_NLE_US);
+}
+
+__m256 test_mm256_cmp_ps_ord_q(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_ord_q
+ // CHECK: fcmp ord <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_ORD_Q);
+}
+
+__m256 test_mm256_cmp_ps_eq_uq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_eq_uq
+ // CHECK: fcmp ueq <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_EQ_UQ);
+}
+
+__m256 test_mm256_cmp_ps_nge_us(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_nge_us
+ // CHECK: fcmp ult <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_NGE_US);
+}
+
+__m256 test_mm256_cmp_ps_ngt_us(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_ngt_us
+ // CHECK: fcmp ule <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_NGT_US);
+}
+
+__m256 test_mm256_cmp_ps_false_oq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_false_oq
+ // CHECK: fcmp false <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_FALSE_OQ);
+}
+
+__m256 test_mm256_cmp_ps_neq_oq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_neq_oq
+ // CHECK: fcmp one <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_NEQ_OQ);
+}
+
+__m256 test_mm256_cmp_ps_ge_os(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_ge_os
+ // CHECK: fcmp oge <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_GE_OS);
+}
+
+__m256 test_mm256_cmp_ps_gt_os(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_gt_os
+ // CHECK: fcmp ogt <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_GT_OS);
+}
+
+__m256 test_mm256_cmp_ps_true_uq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_true_uq
+ // CHECK: fcmp true <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_TRUE_UQ);
+}
+
+__m256 test_mm256_cmp_ps_eq_os(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_eq_os
+ // CHECK: fcmp oeq <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_EQ_OS);
+}
+
+__m256 test_mm256_cmp_ps_lt_oq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_lt_oq
+ // CHECK: fcmp olt <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
+}
+
+__m256 test_mm256_cmp_ps_le_oq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_le_oq
+ // CHECK: fcmp ole <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
+}
+
+__m256 test_mm256_cmp_ps_unord_s(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_unord_s
+ // CHECK: fcmp uno <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_UNORD_S);
+}
+
+__m256 test_mm256_cmp_ps_neq_us(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_neq_us
+ // CHECK: fcmp une <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_NEQ_US);
+}
+
+__m256 test_mm256_cmp_ps_nlt_uq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_nlt_uq
+ // CHECK: fcmp uge <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_NLT_UQ);
+}
+
+__m256 test_mm256_cmp_ps_nle_uq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_nle_uq
+ // CHECK: fcmp ugt <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_NLE_UQ);
+}
+
+__m256 test_mm256_cmp_ps_ord_s(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_ord_s
+ // CHECK: fcmp ord <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_ORD_S);
+}
+
+__m256 test_mm256_cmp_ps_eq_us(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_eq_us
+ // CHECK: fcmp ueq <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_EQ_US);
+}
+
+__m256 test_mm256_cmp_ps_nge_uq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_nge_uq
+ // CHECK: fcmp ult <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_NGE_UQ);
+}
+
+__m256 test_mm256_cmp_ps_ngt_uq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_ngt_uq
+ // CHECK: fcmp ule <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_NGT_UQ);
+}
+
+__m256 test_mm256_cmp_ps_false_os(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_false_os
+ // CHECK: fcmp false <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_FALSE_OS);
+}
+
+__m256 test_mm256_cmp_ps_neq_os(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_neq_os
+ // CHECK: fcmp one <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_NEQ_OS);
+}
+
+__m256 test_mm256_cmp_ps_ge_oq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_ge_oq
+ // CHECK: fcmp oge <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_GE_OQ);
+}
+
+__m256 test_mm256_cmp_ps_gt_oq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_gt_oq
+ // CHECK: fcmp ogt <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_GT_OQ);
+}
+
+__m256 test_mm256_cmp_ps_true_us(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_true_us
+ // CHECK: fcmp true <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps(a, b, _CMP_TRUE_US);
+}
+
+__m128d test_mm_cmp_pd_eq_oq(__m128d a, __m128d b) {
+ // CHECK-LABEL: @test_mm_cmp_pd_eq_oq
+ // CHECK: fcmp oeq <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_EQ_OQ);
+}
+
+__m128d test_mm_cmp_pd_lt_os(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_lt_os
+ // CHECK: fcmp olt <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_LT_OS);
+}
+
+__m128d test_mm_cmp_pd_le_os(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_le_os
+ // CHECK: fcmp ole <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_LE_OS);
+}
+
+__m128d test_mm_cmp_pd_unord_q(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_unord_q
+ // CHECK: fcmp uno <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_UNORD_Q);
+}
+
+__m128d test_mm_cmp_pd_neq_uq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_neq_uq
+ // CHECK: fcmp une <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_NEQ_UQ);
+}
+
+__m128d test_mm_cmp_pd_nlt_us(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_nlt_us
+ // CHECK: fcmp uge <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_NLT_US);
+}
+
+__m128d test_mm_cmp_pd_nle_us(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_nle_us
+ // CHECK: fcmp ugt <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_NLE_US);
+}
+
+__m128d test_mm_cmp_pd_ord_q(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_ord_q
+ // CHECK: fcmp ord <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_ORD_Q);
+}
+
+__m128d test_mm_cmp_pd_eq_uq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_eq_uq
+ // CHECK: fcmp ueq <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_EQ_UQ);
+}
+
+__m128d test_mm_cmp_pd_nge_us(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_nge_us
+ // CHECK: fcmp ult <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_NGE_US);
+}
+
+__m128d test_mm_cmp_pd_ngt_us(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_ngt_us
+ // CHECK: fcmp ule <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_NGT_US);
+}
+
+__m128d test_mm_cmp_pd_false_oq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_false_oq
+ // CHECK: fcmp false <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_FALSE_OQ);
+}
+
+__m128d test_mm_cmp_pd_neq_oq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_neq_oq
+ // CHECK: fcmp one <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_NEQ_OQ);
+}
+
+__m128d test_mm_cmp_pd_ge_os(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_ge_os
+ // CHECK: fcmp oge <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_GE_OS);
+}
+
+__m128d test_mm_cmp_pd_gt_os(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_gt_os
+ // CHECK: fcmp ogt <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_GT_OS);
+}
+
+__m128d test_mm_cmp_pd_true_uq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_true_uq
+ // CHECK: fcmp true <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_TRUE_UQ);
+}
+
+__m128d test_mm_cmp_pd_eq_os(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_eq_os
+ // CHECK: fcmp oeq <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_EQ_OS);
+}
+
+__m128d test_mm_cmp_pd_lt_oq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_lt_oq
+ // CHECK: fcmp olt <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_LT_OQ);
+}
+
+__m128d test_mm_cmp_pd_le_oq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_le_oq
+ // CHECK: fcmp ole <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_LE_OQ);
+}
+
+__m128d test_mm_cmp_pd_unord_s(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_unord_s
+ // CHECK: fcmp uno <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_UNORD_S);
+}
+
+__m128d test_mm_cmp_pd_neq_us(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_neq_us
+ // CHECK: fcmp une <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_NEQ_US);
+}
+
+__m128d test_mm_cmp_pd_nlt_uq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_nlt_uq
+ // CHECK: fcmp uge <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_NLT_UQ);
+}
+
+__m128d test_mm_cmp_pd_nle_uq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_nle_uq
+ // CHECK: fcmp ugt <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_NLE_UQ);
+}
+
+__m128d test_mm_cmp_pd_ord_s(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_ord_s
+ // CHECK: fcmp ord <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_ORD_S);
+}
+
+__m128d test_mm_cmp_pd_eq_us(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_eq_us
+ // CHECK: fcmp ueq <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_EQ_US);
+}
+
+__m128d test_mm_cmp_pd_nge_uq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_nge_uq
+ // CHECK: fcmp ult <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_NGE_UQ);
+}
+
+__m128d test_mm_cmp_pd_ngt_uq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_ngt_uq
+ // CHECK: fcmp ule <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_NGT_UQ);
+}
+
+__m128d test_mm_cmp_pd_false_os(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_false_os
+ // CHECK: fcmp false <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_FALSE_OS);
+}
+
+__m128d test_mm_cmp_pd_neq_os(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_neq_os
+ // CHECK: fcmp one <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_NEQ_OS);
+}
+
+__m128d test_mm_cmp_pd_ge_oq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_ge_oq
+ // CHECK: fcmp oge <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_GE_OQ);
+}
+
+__m128d test_mm_cmp_pd_gt_oq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_gt_oq
+ // CHECK: fcmp ogt <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_GT_OQ);
+}
+
+__m128d test_mm_cmp_pd_true_us(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_true_us
+ // CHECK: fcmp true <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd(a, b, _CMP_TRUE_US);
+}
+
+__m128 test_mm_cmp_ps_eq_oq(__m128 a, __m128 b) {
+ // CHECK-LABEL: @test_mm_cmp_ps_eq_oq
+ // CHECK: fcmp oeq <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_EQ_OQ);
+}
+
+__m128 test_mm_cmp_ps_lt_os(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_lt_os
+ // CHECK: fcmp olt <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_LT_OS);
+}
+
+__m128 test_mm_cmp_ps_le_os(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_le_os
+ // CHECK: fcmp ole <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_LE_OS);
+}
+
+__m128 test_mm_cmp_ps_unord_q(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_unord_q
+ // CHECK: fcmp uno <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_UNORD_Q);
+}
+
+__m128 test_mm_cmp_ps_neq_uq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_neq_uq
+ // CHECK: fcmp une <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_NEQ_UQ);
+}
+
+__m128 test_mm_cmp_ps_nlt_us(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_nlt_us
+ // CHECK: fcmp uge <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_NLT_US);
+}
+
+__m128 test_mm_cmp_ps_nle_us(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_nle_us
+ // CHECK: fcmp ugt <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_NLE_US);
+}
+
+__m128 test_mm_cmp_ps_ord_q(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_ord_q
+ // CHECK: fcmp ord <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_ORD_Q);
+}
+
+__m128 test_mm_cmp_ps_eq_uq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_eq_uq
+ // CHECK: fcmp ueq <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_EQ_UQ);
+}
+
+__m128 test_mm_cmp_ps_nge_us(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_nge_us
+ // CHECK: fcmp ult <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_NGE_US);
+}
+
+__m128 test_mm_cmp_ps_ngt_us(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_ngt_us
+ // CHECK: fcmp ule <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_NGT_US);
+}
+
+__m128 test_mm_cmp_ps_false_oq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_false_oq
+ // CHECK: fcmp false <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_FALSE_OQ);
+}
+
+__m128 test_mm_cmp_ps_neq_oq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_neq_oq
+ // CHECK: fcmp one <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_NEQ_OQ);
+}
+
+__m128 test_mm_cmp_ps_ge_os(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_ge_os
+ // CHECK: fcmp oge <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_GE_OS);
+}
+
+__m128 test_mm_cmp_ps_gt_os(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_gt_os
+ // CHECK: fcmp ogt <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_GT_OS);
+}
+
+__m128 test_mm_cmp_ps_true_uq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_true_uq
+ // CHECK: fcmp true <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_TRUE_UQ);
+}
+
+__m128 test_mm_cmp_ps_eq_os(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_eq_os
+ // CHECK: fcmp oeq <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_EQ_OS);
+}
+
+__m128 test_mm_cmp_ps_lt_oq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_lt_oq
+ // CHECK: fcmp olt <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_LT_OQ);
+}
+
+__m128 test_mm_cmp_ps_le_oq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_le_oq
+ // CHECK: fcmp ole <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_LE_OQ);
+}
+
+__m128 test_mm_cmp_ps_unord_s(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_unord_s
+ // CHECK: fcmp uno <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_UNORD_S);
+}
+
+__m128 test_mm_cmp_ps_neq_us(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_neq_us
+ // CHECK: fcmp une <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_NEQ_US);
+}
+
+__m128 test_mm_cmp_ps_nlt_uq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_nlt_uq
+ // CHECK: fcmp uge <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_NLT_UQ);
+}
+
+__m128 test_mm_cmp_ps_nle_uq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_nle_uq
+ // CHECK: fcmp ugt <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_NLE_UQ);
+}
+
+__m128 test_mm_cmp_ps_ord_s(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_ord_s
+ // CHECK: fcmp ord <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_ORD_S);
+}
+
+__m128 test_mm_cmp_ps_eq_us(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_eq_us
+ // CHECK: fcmp ueq <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_EQ_US);
+}
+
+__m128 test_mm_cmp_ps_nge_uq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_nge_uq
+ // CHECK: fcmp ult <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_NGE_UQ);
+}
+
+__m128 test_mm_cmp_ps_ngt_uq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_ngt_uq
+ // CHECK: fcmp ule <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_NGT_UQ);
+}
+
+__m128 test_mm_cmp_ps_false_os(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_false_os
+ // CHECK: fcmp false <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_FALSE_OS);
+}
+
+__m128 test_mm_cmp_ps_neq_os(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_neq_os
+ // CHECK: fcmp one <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_NEQ_OS);
+}
+
+__m128 test_mm_cmp_ps_ge_oq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_ge_oq
+ // CHECK: fcmp oge <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_GE_OQ);
+}
+
+__m128 test_mm_cmp_ps_gt_oq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_gt_oq
+ // CHECK: fcmp ogt <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_GT_OQ);
+}
+
+__m128 test_mm_cmp_ps_true_us(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_true_us
+ // CHECK: fcmp true <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps(a, b, _CMP_TRUE_US);
}
__m128d test_mm_cmp_sd(__m128d A, __m128d B) {
@@ -256,7 +1000,7 @@ __m256d test_mm256_cvtepi32_pd(__m128i A) {
__m256 test_mm256_cvtepi32_ps(__m256i A) {
// CHECK-LABEL: test_mm256_cvtepi32_ps
- // CHECK: call <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32> %{{.*}})
+ // CHECK: sitofp <8 x i32> %{{.*}} to <8 x float>
return _mm256_cvtepi32_ps(A);
}
@@ -316,49 +1060,45 @@ __m256 test_mm256_dp_ps(__m256 A, __m256 B) {
int test_mm256_extract_epi8(__m256i A) {
// CHECK-LABEL: test_mm256_extract_epi8
- // CHECK: and i32 %{{.*}}, 31
- // CHECK: extractelement <32 x i8> %{{.*}}, i32 %{{.*}}
+ // CHECK: extractelement <32 x i8> %{{.*}}, {{i32|i64}} 31
// CHECK: zext i8 %{{.*}} to i32
- return _mm256_extract_epi8(A, 32);
+ return _mm256_extract_epi8(A, 31);
}
int test_mm256_extract_epi16(__m256i A) {
// CHECK-LABEL: test_mm256_extract_epi16
- // CHECK: and i32 %{{.*}}, 15
- // CHECK: extractelement <16 x i16> %{{.*}}, i32 %{{.*}}
+ // CHECK: extractelement <16 x i16> %{{.*}}, {{i32|i64}} 15
// CHECK: zext i16 %{{.*}} to i32
- return _mm256_extract_epi16(A, 16);
+ return _mm256_extract_epi16(A, 15);
}
int test_mm256_extract_epi32(__m256i A) {
// CHECK-LABEL: test_mm256_extract_epi32
- // CHECK: and i32 %{{.*}}, 7
- // CHECK: extractelement <8 x i32> %{{.*}}, i32 %{{.*}}
- return _mm256_extract_epi32(A, 8);
+ // CHECK: extractelement <8 x i32> %{{.*}}, {{i32|i64}} 7
+ return _mm256_extract_epi32(A, 7);
}
long long test_mm256_extract_epi64(__m256i A) {
// CHECK-LABEL: test_mm256_extract_epi64
- // CHECK: and i32 %{{.*}}, 3
- // CHECK: extractelement <4 x i64> %{{.*}}, i32 %{{.*}}
- return _mm256_extract_epi64(A, 5);
+ // CHECK: extractelement <4 x i64> %{{.*}}, {{i32|i64}} 3
+ return _mm256_extract_epi64(A, 3);
}
__m128d test_mm256_extractf128_pd(__m256d A) {
// CHECK-LABEL: test_mm256_extractf128_pd
- // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <2 x i32> <i32 2, i32 3>
+ // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
return _mm256_extractf128_pd(A, 1);
}
__m128 test_mm256_extractf128_ps(__m256 A) {
// CHECK-LABEL: test_mm256_extractf128_ps
- // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm256_extractf128_ps(A, 1);
}
__m128i test_mm256_extractf128_si256(__m256i A) {
// CHECK-LABEL: test_mm256_extractf128_si256
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <2 x i32> <i32 2, i32 3>
+ // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm256_extractf128_si256(A, 1);
}
@@ -400,50 +1140,46 @@ __m256 test_mm256_hsub_ps(__m256 A, __m256 B) {
__m256i test_mm256_insert_epi8(__m256i x, char b) {
// CHECK-LABEL: test_mm256_insert_epi8
- // CHECK: and i32 %{{.*}}, 31
- // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 %{{.*}}
- return _mm256_insert_epi8(x, b, 17);
+ // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, {{i32|i64}} 14
+ return _mm256_insert_epi8(x, b, 14);
}
__m256i test_mm256_insert_epi16(__m256i x, int b) {
// CHECK-LABEL: test_mm256_insert_epi16
- // CHECK: and i32 %{{.*}}, 15
- // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 %{{.*}}
+ // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, {{i32|i64}} 4
return _mm256_insert_epi16(x, b, 4);
}
__m256i test_mm256_insert_epi32(__m256i x, int b) {
// CHECK-LABEL: test_mm256_insert_epi32
- // CHECK: and i32 %{{.*}}, 7
- // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}
+ // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, {{i32|i64}} 5
return _mm256_insert_epi32(x, b, 5);
}
__m256i test_mm256_insert_epi64(__m256i x, long long b) {
// CHECK-LABEL: test_mm256_insert_epi64
- // CHECK: and i32 %{{.*}}, 3
- // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 %{{.*}}
+ // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, {{i32|i64}} 2
return _mm256_insert_epi64(x, b, 2);
}
__m256d test_mm256_insertf128_pd(__m256d A, __m128d B) {
// CHECK-LABEL: test_mm256_insertf128_pd
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
return _mm256_insertf128_pd(A, B, 0);
}
__m256 test_mm256_insertf128_ps(__m256 A, __m128 B) {
// CHECK-LABEL: test_mm256_insertf128_ps
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
return _mm256_insertf128_ps(A, B, 1);
}
__m256i test_mm256_insertf128_si256(__m256i A, __m128i B) {
// CHECK-LABEL: test_mm256_insertf128_si256
- // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
return _mm256_insertf128_si256(A, B, 0);
}
@@ -494,7 +1230,7 @@ __m256 test_mm256_loadu2_m128(float* A, float* B) {
// CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
// CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
return _mm256_loadu2_m128(A, B);
}
@@ -504,7 +1240,7 @@ __m256d test_mm256_loadu2_m128d(double* A, double* B) {
// CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
// CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
return _mm256_loadu2_m128d(A, B);
}
@@ -514,8 +1250,8 @@ __m256i test_mm256_loadu2_m128i(__m128i* A, __m128i* B) {
// CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1{{$}}
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
// CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1{{$}}
- // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
return _mm256_loadu2_m128i(A, B);
}
@@ -647,32 +1383,32 @@ __m256 test_mm256_or_ps(__m256 A, __m256 B) {
__m128d test_mm_permute_pd(__m128d A) {
// CHECK-LABEL: test_mm_permute_pd
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> zeroinitializer, <2 x i32> <i32 1, i32 0>
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> <i32 1, i32 0>
return _mm_permute_pd(A, 1);
}
__m256d test_mm256_permute_pd(__m256d A) {
// CHECK-LABEL: test_mm256_permute_pd
- // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
return _mm256_permute_pd(A, 5);
}
__m128 test_mm_permute_ps(__m128 A) {
// CHECK-LABEL: test_mm_permute_ps
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
return _mm_permute_ps(A, 0x1b);
}
// Test case for PR12401
__m128 test2_mm_permute_ps(__m128 a) {
// CHECK-LABEL: test2_mm_permute_ps
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
return _mm_permute_ps(a, 0xe6);
}
__m256 test_mm256_permute_ps(__m256 A) {
// CHECK-LABEL: test_mm256_permute_ps
- // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
return _mm256_permute_ps(A, 0x1b);
}
@@ -834,13 +1570,13 @@ __m256 test_mm256_set_m128(__m128 A, __m128 B) {
__m256d test_mm256_set_m128d(__m128d A, __m128d B) {
// CHECK-LABEL: test_mm256_set_m128d
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
return _mm256_set_m128d(A, B);
}
__m256i test_mm256_set_m128i(__m128i A, __m128i B) {
// CHECK-LABEL: test_mm256_set_m128i
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
return _mm256_set_m128i(A, B);
}
@@ -1060,13 +1796,13 @@ __m256 test_mm256_setr_m128(__m128 A, __m128 B) {
__m256d test_mm256_setr_m128d(__m128d A, __m128d B) {
// CHECK-LABEL: test_mm256_setr_m128d
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
return _mm256_setr_m128d(A, B);
}
__m256i test_mm256_setr_m128i(__m128i A, __m128i B) {
// CHECK-LABEL: test_mm256_setr_m128i
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
return _mm256_setr_m128i(A, B);
}
@@ -1124,13 +1860,13 @@ __m256 test_mm256_shuffle_ps(__m256 A, __m256 B) {
__m256d test_mm256_sqrt_pd(__m256d A) {
// CHECK-LABEL: test_mm256_sqrt_pd
- // CHECK: call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %{{.*}})
+ // CHECK: call <4 x double> @llvm.sqrt.v4f64(<4 x double> %{{.*}})
return _mm256_sqrt_pd(A);
}
__m256 test_mm256_sqrt_ps(__m256 A) {
// CHECK-LABEL: test_mm256_sqrt_ps
- // CHECK: call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %{{.*}})
+ // CHECK: call <8 x float> @llvm.sqrt.v8f32(<8 x float> %{{.*}})
return _mm256_sqrt_ps(A);
}
@@ -1177,7 +1913,7 @@ void test_mm256_storeu2_m128(float* A, float* B, __m256 C) {
// CHECK-LABEL: test_mm256_storeu2_m128
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
- // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
_mm256_storeu2_m128(A, B, C);
}
@@ -1186,7 +1922,7 @@ void test_mm256_storeu2_m128d(double* A, double* B, __m256d C) {
// CHECK-LABEL: test_mm256_storeu2_m128d
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <2 x i32> <i32 0, i32 1>
// CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 1{{$}}
- // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <2 x i32> <i32 2, i32 3>
+ // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
// CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 1{{$}}
_mm256_storeu2_m128d(A, B, C);
}
@@ -1195,7 +1931,7 @@ void test_mm256_storeu2_m128i(__m128i* A, __m128i* B, __m256i C) {
// CHECK-LABEL: test_mm256_storeu2_m128i
// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <2 x i32> <i32 0, i32 1>
// CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <2 x i32> <i32 2, i32 3>
+ // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
_mm256_storeu2_m128i(A, B, C);
}
@@ -1254,7 +1990,7 @@ int test_mm256_testc_ps(__m256 A, __m256 B) {
return _mm256_testc_ps(A, B);
}
-int test_mm256_testc_si256(__m256 A, __m256 B) {
+int test_mm256_testc_si256(__m256i A, __m256i B) {
// CHECK-LABEL: test_mm256_testc_si256
// CHECK: call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
return _mm256_testc_si256(A, B);
@@ -1284,7 +2020,7 @@ int test_mm256_testnzc_ps(__m256 A, __m256 B) {
return _mm256_testnzc_ps(A, B);
}
-int test_mm256_testnzc_si256(__m256 A, __m256 B) {
+int test_mm256_testnzc_si256(__m256i A, __m256i B) {
// CHECK-LABEL: test_mm256_testnzc_si256
// CHECK: call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
return _mm256_testnzc_si256(A, B);
@@ -1314,7 +2050,7 @@ int test_mm256_testz_ps(__m256 A, __m256 B) {
return _mm256_testz_ps(A, B);
}
-int test_mm256_testz_si256(__m256 A, __m256 B) {
+int test_mm256_testz_si256(__m256i A, __m256i B) {
// CHECK-LABEL: test_mm256_testz_si256
// CHECK: call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
return _mm256_testz_si256(A, B);
@@ -1409,69 +2145,21 @@ __m256i test_mm256_zextsi128_si256(__m128i A) {
double test_mm256_cvtsd_f64(__m256d __a)
{
- // CHECK-LABEL: @test_mm256_cvtsd_f64
- // CHECK: extractelement <4 x double> %{{.*}}, i32 0
- return _mm256_cvtsd_f64(__a);
+ // CHECK-LABEL: @test_mm256_cvtsd_f64
+ // CHECK: extractelement <4 x double> %{{.*}}, i32 0
+ return _mm256_cvtsd_f64(__a);
}
int test_mm256_cvtsi256_si32(__m256i __a)
{
- // CHECK-LABEL: @test_mm256_cvtsi256_si32
- // CHECK: extractelement <8 x i32> %{{.*}}, i32 0
- return _mm256_cvtsi256_si32(__a);
+ // CHECK-LABEL: @test_mm256_cvtsi256_si32
+ // CHECK: extractelement <8 x i32> %{{.*}}, i32 0
+ return _mm256_cvtsi256_si32(__a);
}
float test_mm256_cvtss_f32(__m256 __a)
{
- // CHECK-LABEL: @test_mm256_cvtss_f32
- // CHECK: extractelement <8 x float> %{{.*}}, i32 0
- return _mm256_cvtss_f32(__a);
-}
-
-__m256 test_mm256_cmp_ps_true(__m256 a, __m256 b) {
- // CHECK-LABEL: @test_mm256_cmp_ps_true
- // CHECK: store <8 x float> <float 0xFFFFFFFFE0000000,
- return _mm256_cmp_ps(a, b, _CMP_TRUE_UQ);
-}
-
-__m256 test_mm256_cmp_pd_true(__m256 a, __m256 b) {
- // CHECK-LABEL: @test_mm256_cmp_pd_true
- // CHECK: store <4 x double> <double 0xFFFFFFFFFFFFFFFF,
- return _mm256_cmp_pd(a, b, _CMP_TRUE_UQ);
-}
-
-__m256 test_mm256_cmp_ps_false(__m256 a, __m256 b) {
- // CHECK-LABEL: @test_mm256_cmp_ps_false
- // CHECK: store <8 x float> zeroinitializer, <8 x float>* %tmp, align 32
- return _mm256_cmp_ps(a, b, _CMP_FALSE_OQ);
-}
-
-__m256 test_mm256_cmp_pd_false(__m256 a, __m256 b) {
- // CHECK-LABEL: @test_mm256_cmp_pd_false
- // CHECK: store <4 x double> zeroinitializer, <4 x double>* %tmp, align 32
- return _mm256_cmp_pd(a, b, _CMP_FALSE_OQ);
-}
-
-__m256 test_mm256_cmp_ps_strue(__m256 a, __m256 b) {
- // CHECK-LABEL: @test_mm256_cmp_ps_strue
- // CHECK: store <8 x float> <float 0xFFFFFFFFE0000000,
- return _mm256_cmp_ps(a, b, _CMP_TRUE_US);
-}
-
-__m256 test_mm256_cmp_pd_strue(__m256 a, __m256 b) {
- // CHECK-LABEL: @test_mm256_cmp_pd_strue
- // CHECK: store <4 x double> <double 0xFFFFFFFFFFFFFFFF,
- return _mm256_cmp_pd(a, b, _CMP_TRUE_US);
-}
-
-__m256 test_mm256_cmp_ps_sfalse(__m256 a, __m256 b) {
- // CHECK-LABEL: @test_mm256_cmp_ps_sfalse
- // CHECK: store <8 x float> zeroinitializer, <8 x float>* %tmp, align 32
- return _mm256_cmp_ps(a, b, _CMP_FALSE_OS);
-}
-
-__m256 test_mm256_cmp_pd_sfalse(__m256 a, __m256 b) {
- // CHECK-LABEL: @test_mm256_cmp_pd_sfalse
- // CHECK: store <4 x double> zeroinitializer, <4 x double>* %tmp, align 32
- return _mm256_cmp_pd(a, b, _CMP_FALSE_OS);
+ // CHECK-LABEL: @test_mm256_cvtss_f32
+ // CHECK: extractelement <8 x float> %{{.*}}, i32 0
+ return _mm256_cvtss_f32(__a);
}
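For reference, a minimal standalone sketch (not part of the patch, assuming the same RUN line as avx-builtins.c) of the pattern the _mm_cmp_pd/_mm_cmp_ps tests above rely on: each _CMP_* predicate now lowers to a plain IR fcmp with the matching predicate rather than a call to the target cmp intrinsic.

#include <immintrin.h>

// Illustrative helper, not part of the test file: under the new lowering,
// _CMP_LT_OS is expected to produce "fcmp olt <2 x double>", mirroring the
// generated checks above for the other predicates.
__m128d cmp_lt_os_example(__m128d a, __m128d b) {
  return _mm_cmp_pd(a, b, _CMP_LT_OS);
}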
diff --git a/test/CodeGen/avx-cmp-builtins.c b/test/CodeGen/avx-cmp-builtins.c
index 7d619426653d..38d3ae24cdf1 100644
--- a/test/CodeGen/avx-cmp-builtins.c
+++ b/test/CodeGen/avx-cmp-builtins.c
@@ -8,30 +8,6 @@
// Test LLVM IR codegen of cmpXY instructions
//
-__m128d test_cmp_pd(__m128d a, __m128d b) {
- // Expects that the third argument in LLVM IR is immediate expression
- // CHECK: @llvm.x86.sse2.cmp.pd({{.*}}, i8 13)
- return _mm_cmp_pd(a, b, _CMP_GE_OS);
-}
-
-__m128d test_cmp_ps(__m128 a, __m128 b) {
- // Expects that the third argument in LLVM IR is immediate expression
- // CHECK: @llvm.x86.sse.cmp.ps({{.*}}, i8 13)
- return _mm_cmp_ps(a, b, _CMP_GE_OS);
-}
-
-__m256d test_cmp_pd256(__m256d a, __m256d b) {
- // Expects that the third argument in LLVM IR is immediate expression
- // CHECK: @llvm.x86.avx.cmp.pd.256({{.*}}, i8 13)
- return _mm256_cmp_pd(a, b, _CMP_GE_OS);
-}
-
-__m256d test_cmp_ps256(__m256 a, __m256 b) {
- // Expects that the third argument in LLVM IR is immediate expression
- // CHECK: @llvm.x86.avx.cmp.ps.256({{.*}}, i8 13)
- return _mm256_cmp_ps(a, b, _CMP_GE_OS);
-}
-
__m128d test_cmp_sd(__m128d a, __m128d b) {
 // Expects that the third argument in LLVM IR is an immediate expression
// CHECK: @llvm.x86.sse2.cmp.sd({{.*}}, i8 13)
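In contrast to the packed forms removed above, the scalar compares kept in this file still lower to the target intrinsic with the predicate passed as an immediate. A small sketch (assuming the same RUN line as this file; the helper name is illustrative):

#include <immintrin.h>

// _CMP_GE_OS has the value 13, which is why the check above expects
// "@llvm.x86.sse2.cmp.sd({{.*}}, i8 13)".
__m128d cmp_sd_ge_example(__m128d a, __m128d b) {
  return _mm_cmp_sd(a, b, _CMP_GE_OS);
}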
diff --git a/test/CodeGen/avx-shuffle-builtins.c b/test/CodeGen/avx-shuffle-builtins.c
index b5bac89bd1ec..fef2879abd55 100644
--- a/test/CodeGen/avx-shuffle-builtins.c
+++ b/test/CodeGen/avx-shuffle-builtins.c
@@ -103,7 +103,7 @@ __m256d test_mm256_insertf128_pd_0(__m256d a, __m128d b) {
__m256i test_mm256_insertf128_si256_0(__m256i a, __m128i b) {
// CHECK-LABEL: @test_mm256_insertf128_si256_0
- // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 2, i32 3>
+ // CHECK: shufflevector{{.*}}<i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
return _mm256_insertf128_si256(a, b, 0);
}
@@ -121,7 +121,7 @@ __m256d test_mm256_insertf128_pd_1(__m256d a, __m128d b) {
__m256i test_mm256_insertf128_si256_1(__m256i a, __m128i b) {
// CHECK-LABEL: @test_mm256_insertf128_si256_1
- // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 4, i32 5>
+ // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
return _mm256_insertf128_si256(a, b, 1);
}
@@ -141,7 +141,7 @@ __m128d test_mm256_extractf128_pd_0(__m256d a) {
__m128i test_mm256_extractf128_si256_0(__m256i a) {
// CHECK-LABEL: @test_mm256_extractf128_si256_0
- // CHECK: shufflevector{{.*}}<i32 0, i32 1>
+ // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3>
return _mm256_extractf128_si256(a, 0);
}
@@ -159,7 +159,7 @@ __m128d test_mm256_extractf128_pd_1(__m256d a) {
__m128i test_mm256_extractf128_si256_1(__m256i a) {
// CHECK-LABEL: @test_mm256_extractf128_si256_1
- // CHECK: shufflevector{{.*}}<i32 2, i32 3>
+ // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 6, i32 7>
return _mm256_extractf128_si256(a, 1);
}
@@ -171,13 +171,13 @@ __m256 test_mm256_set_m128(__m128 hi, __m128 lo) {
__m256d test_mm256_set_m128d(__m128d hi, __m128d lo) {
// CHECK-LABEL: @test_mm256_set_m128d
- // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3>
return _mm256_set_m128d(hi, lo);
}
__m256i test_mm256_set_m128i(__m128i hi, __m128i lo) {
// CHECK-LABEL: @test_mm256_set_m128i
- // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3>
return _mm256_set_m128i(hi, lo);
}
@@ -189,12 +189,12 @@ __m256 test_mm256_setr_m128(__m128 hi, __m128 lo) {
__m256d test_mm256_setr_m128d(__m128d hi, __m128d lo) {
// CHECK-LABEL: @test_mm256_setr_m128d
- // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3>
return _mm256_setr_m128d(lo, hi);
}
__m256i test_mm256_setr_m128i(__m128i hi, __m128i lo) {
// CHECK-LABEL: @test_mm256_setr_m128i
- // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3>
return _mm256_setr_m128i(lo, hi);
}
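The updated masks above reflect that the 128-bit lane insert/extract intrinsics are now checked as shufflevector operations over the integer vector viewed as 32-bit lanes. A minimal sketch (not part of the patch; the function name is illustrative):

#include <immintrin.h>

// Extracting the upper 128-bit lane selects elements 4..7, matching the
// "<i32 4, i32 5, i32 6, i32 7>" mask checked above.
__m128i extract_high_lane(__m256i v) {
  return _mm256_extractf128_si256(v, 1);
}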
diff --git a/test/CodeGen/avx2-builtins.c b/test/CodeGen/avx2-builtins.c
index f79f60e6db78..b0f4b5e6b55b 100644
--- a/test/CodeGen/avx2-builtins.c
+++ b/test/CodeGen/avx2-builtins.c
@@ -2,7 +2,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
// NOTE: This should match the tests in llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -141,7 +141,7 @@ __m128i test_mm_blend_epi32(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_blend_epi32
// CHECK-NOT: @llvm.x86.avx2.pblendd.128
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
- return _mm_blend_epi32(a, b, 0x35);
+ return _mm_blend_epi32(a, b, 0x05);
}
__m256i test_mm256_blend_epi32(__m256i a, __m256i b) {
@@ -248,13 +248,13 @@ __m256i test_mm256_broadcastw_epi16(__m128i a) {
__m256i test_mm256_bslli_epi128(__m256i a) {
// CHECK-LABEL: test_mm256_bslli_epi128
- // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
+ // CHECK: shufflevector <32 x i8> zeroinitializer, <32 x i8> %{{.*}}, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
return _mm256_bslli_epi128(a, 3);
}
__m256i test_mm256_bsrli_epi128(__m256i a) {
// CHECK-LABEL: test_mm256_bsrli_epi128
- // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
+ // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
return _mm256_bsrli_epi128(a, 3);
}
@@ -386,21 +386,21 @@ __m256i test_mm256_cvtepu32_epi64(__m128i a) {
__m128i test0_mm256_extracti128_si256_0(__m256i a) {
// CHECK-LABEL: test0_mm256_extracti128_si256
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <2 x i32> <i32 0, i32 1>
+ // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
return _mm256_extracti128_si256(a, 0);
}
__m128i test1_mm256_extracti128_si256_1(__m256i a) {
// CHECK-LABEL: test1_mm256_extracti128_si256
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <2 x i32> <i32 2, i32 3>
+ // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
return _mm256_extracti128_si256(a, 1);
}
// Immediate should be truncated to one bit.
__m128i test2_mm256_extracti128_si256(__m256i a) {
// CHECK-LABEL: test2_mm256_extracti128_si256
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <2 x i32> <i32 0, i32 1>
- return _mm256_extracti128_si256(a, 2);
+ // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+ return _mm256_extracti128_si256(a, 0);
}
__m256i test_mm256_hadd_epi16(__m256i a, __m256i b) {
@@ -612,9 +612,7 @@ __m128d test_mm_mask_i64gather_pd(__m128d a, double const *b, __m128i c, __m128d
__m256d test_mm256_i64gather_pd(double const *b, __m256i c) {
// CHECK-LABEL: test_mm256_i64gather_pd
- // CHECK: [[CMP:%.*]] = fcmp oeq <4 x double>
- // CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i64>
- // CHECK-NEXT: [[BC:%.*]] = bitcast <4 x i64> [[SEXT]] to <4 x double>
+ // CHECK: fcmp oeq <4 x double> %{{.*}}, %{{.*}}
// CHECK: call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> zeroinitializer, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x double> %{{.*}}, i8 2)
return _mm256_i64gather_pd(b, c, 2);
}
@@ -657,14 +655,14 @@ __m128 test_mm256_mask_i64gather_ps(__m128 a, float const *b, __m256i c, __m128
__m256i test0_mm256_inserti128_si256(__m256i a, __m128i b) {
// CHECK-LABEL: test0_mm256_inserti128_si256
- // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
return _mm256_inserti128_si256(a, b, 0);
}
__m256i test1_mm256_inserti128_si256(__m256i a, __m128i b) {
// CHECK-LABEL: test1_mm256_inserti128_si256
- // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
return _mm256_inserti128_si256(a, b, 1);
}
@@ -672,9 +670,9 @@ __m256i test1_mm256_inserti128_si256(__m256i a, __m128i b) {
// Immediate should be truncated to one bit.
__m256i test2_mm256_inserti128_si256(__m256i a, __m128i b) {
// CHECK-LABEL: test2_mm256_inserti128_si256
- // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
- return _mm256_inserti128_si256(a, b, 2);
+ return _mm256_inserti128_si256(a, b, 0);
}
__m256i test_mm256_madd_epi16(__m256i a, __m256i b) {
@@ -835,13 +833,19 @@ __m256i test_mm256_mpsadbw_epu8(__m256i x, __m256i y) {
__m256i test_mm256_mul_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_mul_epi32
- // CHECK: call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: shl <4 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32>
+ // CHECK: ashr <4 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32>
+ // CHECK: shl <4 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32>
+ // CHECK: ashr <4 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32>
+ // CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
return _mm256_mul_epi32(a, b);
}
__m256i test_mm256_mul_epu32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_mul_epu32
- // CHECK: call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ // CHECK: and <4 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ // CHECK: and <4 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ // CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
return _mm256_mul_epu32(a, b);
}
@@ -913,13 +917,13 @@ __m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
__m256i test_mm256_permute4x64_epi64(__m256i a) {
// CHECK-LABEL: test_mm256_permute4x64_epi64
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
+ // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
return _mm256_permute4x64_epi64(a, 35);
}
__m256d test_mm256_permute4x64_pd(__m256d a) {
// CHECK-LABEL: test_mm256_permute4x64_pd
- // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
+ // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
return _mm256_permute4x64_pd(a, 25);
}
@@ -949,19 +953,19 @@ __m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) {
__m256i test_mm256_shuffle_epi32(__m256i a) {
// CHECK-LABEL: test_mm256_shuffle_epi32
- // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
+ // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
return _mm256_shuffle_epi32(a, 15);
}
__m256i test_mm256_shufflehi_epi16(__m256i a) {
// CHECK-LABEL: test_mm256_shufflehi_epi16
- // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
+ // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
return _mm256_shufflehi_epi16(a, 107);
}
__m256i test_mm256_shufflelo_epi16(__m256i a) {
// CHECK-LABEL: test_mm256_shufflelo_epi16
- // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
+ // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
return _mm256_shufflelo_epi16(a, 83);
}
@@ -1003,7 +1007,7 @@ __m256i test_mm256_slli_epi64(__m256i a) {
__m256i test_mm256_slli_si256(__m256i a) {
// CHECK-LABEL: test_mm256_slli_si256
- // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
+ // CHECK: shufflevector <32 x i8> zeroinitializer, <32 x i8> %{{.*}}, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
return _mm256_slli_si256(a, 3);
}
@@ -1105,7 +1109,7 @@ __m256i test_mm256_srli_epi64(__m256i a) {
__m256i test_mm256_srli_si256(__m256i a) {
// CHECK-LABEL: test_mm256_srli_si256
- // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
+ // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
return _mm256_srli_si256(a, 3);
}
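Several of the avx2-builtins.c checks above (for example _mm256_mul_epi32 and _mm256_mul_epu32) now describe a generic lowering built from shifts, masks, and a 64-bit multiply instead of the pmul.dq intrinsics. A small sketch of the code shape being tested (illustrative only, assuming the same RUN line as the file):

#include <immintrin.h>

// The checks expect the even 32-bit elements of each operand to be
// sign-extended in place (shl by 32, then ashr by 32) and multiplied as
// <4 x i64>; the unsigned variant masks with 0xffffffff instead.
__m256i mul_even_signed(__m256i a, __m256i b) {
  return _mm256_mul_epi32(a, b);
}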
diff --git a/test/CodeGen/avx512-reduceIntrin.c b/test/CodeGen/avx512-reduceIntrin.c
index d24cd0e5634d..8157095fc644 100644
--- a/test/CodeGen/avx512-reduceIntrin.c
+++ b/test/CodeGen/avx512-reduceIntrin.c
@@ -1,410 +1,404 @@
-// RUN: %clang_cc1 -ffreestanding %s -O2 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s
#include <immintrin.h>
long long test_mm512_reduce_add_epi64(__m512i __W){
- // CHECK: %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
- // CHECK: %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
- // CHECK: %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- // CHECK: %add7.i = add <2 x i64> %shuffle6.i, %add4.i
- // CHECK: %vecext.i = extractelement <2 x i64> %add7.i, i32 0
- // CHECK: ret i64 %vecext.i
+// CHECK-LABEL: @test_mm512_reduce_add_epi64(
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: add <4 x i64> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: add <2 x i64> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 0>
+// CHECK: add <2 x i64> %{{.*}}, %{{.*}}
+// CHECK: extractelement <2 x i64> %{{.*}}, i32 0
return _mm512_reduce_add_epi64(__W);
}
long long test_mm512_reduce_mul_epi64(__m512i __W){
- // CHECK: %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
- // CHECK: %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
- // CHECK: %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- // CHECK: %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
- // CHECK: %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
- // CHECK: ret i64 %vecext.i
+// CHECK-LABEL: @test_mm512_reduce_mul_epi64(
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 0>
+// CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
+// CHECK: extractelement <2 x i64> %{{.*}}, i32 0
return _mm512_reduce_mul_epi64(__W);
}
long long test_mm512_reduce_or_epi64(__m512i __W){
- // CHECK: %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
- // CHECK: %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
- // CHECK: %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- // CHECK: %or7.i = or <2 x i64> %shuffle6.i, %or4.i
- // CHECK: %vecext.i = extractelement <2 x i64> %or7.i, i32 0
- // CHECK: ret i64 %vecext.i
+// CHECK-LABEL: @test_mm512_reduce_or_epi64(
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: or <4 x i64> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: or <2 x i64> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 0>
+// CHECK: or <2 x i64> %{{.*}}, %{{.*}}
+// CHECK: extractelement <2 x i64> %{{.*}}, i32 0
return _mm512_reduce_or_epi64(__W);
}
long long test_mm512_reduce_and_epi64(__m512i __W){
- // CHECK: %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %and.i = and <4 x i64> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
- // CHECK: %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
- // CHECK: %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- // CHECK: %and7.i = and <2 x i64> %shuffle6.i, %and4.i
- // CHECK: %vecext.i = extractelement <2 x i64> %and7.i, i32 0
- // CHECK: ret i64 %vecext.i
+// CHECK-LABEL: @test_mm512_reduce_and_epi64(
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: and <4 x i64> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 0>
+// CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+// CHECK: extractelement <2 x i64> %{{.*}}, i32 0
return _mm512_reduce_and_epi64(__W);
}
long long test_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W){
- // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1>
- // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x i64> %__W, <8 x i64> zeroinitializer
- // CHECK: %shuffle.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle1.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
- // CHECK: %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
- // CHECK: %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- // CHECK: %add7.i = add <2 x i64> %shuffle6.i, %add4.i
- // CHECK: %vecext.i = extractelement <2 x i64> %add7.i, i32 0
- // CHECK: ret i64 %vecext.i
+// CHECK-LABEL: @test_mm512_mask_reduce_add_epi64(
+// CHECK: bitcast i8 %{{.*}} to <8 x i1>
+// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: add <4 x i64> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: add <2 x i64> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 0>
+// CHECK: add <2 x i64> %{{.*}}, %{{.*}}
+// CHECK: extractelement <2 x i64> %{{.*}}, i32 0
return _mm512_mask_reduce_add_epi64(__M, __W);
}
long long test_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W){
- // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1>
- // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x i64> %__W, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
- // CHECK: %shuffle.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle1.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
- // CHECK: %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
- // CHECK: %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- // CHECK: %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
- // CHECK: %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
- // CHECK: ret i64 %vecext.i
+// CHECK-LABEL: @test_mm512_mask_reduce_mul_epi64(
+// CHECK: bitcast i8 %{{.*}} to <8 x i1>
+// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 0>
+// CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
+// CHECK: extractelement <2 x i64> %{{.*}}, i32 0
return _mm512_mask_reduce_mul_epi64(__M, __W);
}
long long test_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W){
- // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1>
- // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
- // CHECK: %shuffle.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle1.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %and.i = and <4 x i64> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
- // CHECK: %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
- // CHECK: %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- // CHECK: %and7.i = and <2 x i64> %shuffle6.i, %and4.i
- // CHECK: %vecext.i = extractelement <2 x i64> %and7.i, i32 0
+// CHECK-LABEL: @test_mm512_mask_reduce_and_epi64(
+// CHECK: bitcast i8 %{{.*}} to <8 x i1>
+// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: and <4 x i64> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 0>
+// CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+// CHECK: extractelement <2 x i64> %{{.*}}, i32 0
return _mm512_mask_reduce_and_epi64(__M, __W);
}
long long test_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W){
- // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1>
- // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x i64> %__W, <8 x i64> zeroinitializer
- // CHECK: %shuffle.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle1.i = shufflevector <8 x i64> {{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
- // CHECK: %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
- // CHECK: %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- // CHECK: %or7.i = or <2 x i64> %shuffle6.i, %or4.i
- // CHECK: %vecext.i = extractelement <2 x i64> %or7.i, i32 0
- // CHECK: ret i64 %vecext.i
+// CHECK-LABEL: @test_mm512_mask_reduce_or_epi64(
+// CHECK: bitcast i8 %{{.*}} to <8 x i1>
+// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: or <4 x i64> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: or <2 x i64> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 0>
+// CHECK: or <2 x i64> %{{.*}}, %{{.*}}
+// CHECK: extractelement <2 x i64> %{{.*}}, i32 0
return _mm512_mask_reduce_or_epi64(__M, __W);
}
int test_mm512_reduce_add_epi32(__m512i __W){
- // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32>
- // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- // CHECK: %add.i = add <8 x i32> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle3.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %add4.i = add <4 x i32> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <4 x i32> %add4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- // CHECK: %add7.i = add <4 x i32> %shuffle6.i, %add4.i
- // CHECK: %shuffle9.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- // CHECK: %add10.i = add <4 x i32> %shuffle9.i, %add7.i
- // CHECK: {{.*}} = bitcast <4 x i32> %add10.i to <2 x i64>
- // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0
- // CHECK: %conv.i = trunc i64 %vecext.i to i32
- // CHECK: ret i32 %conv.i
+// CHECK-LABEL: @test_mm512_reduce_add_epi32(
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: add <8 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: add <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK: add <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK: add <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
return _mm512_reduce_add_epi32(__W);
}
int test_mm512_reduce_mul_epi32(__m512i __W){
- // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32>
- // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- // CHECK: %mul.i = mul <8 x i32> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle3.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %mul4.i = mul <4 x i32> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <4 x i32> %mul4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- // CHECK: %mul7.i = mul <4 x i32> %shuffle6.i, %mul4.i
- // CHECK: %shuffle9.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- // CHECK: %mul10.i = mul <4 x i32> %shuffle9.i, %mul7.i
- // CHECK: {{.*}} = bitcast <4 x i32> %mul10.i to <2 x i64>
- // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0
- // CHECK: %conv.i = trunc i64 %vecext.i to i32
- // CHECK: ret i32 %conv.i
+// CHECK-LABEL: @test_mm512_reduce_mul_epi32(
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: mul <8 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: mul <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK: mul <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK: mul <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
return _mm512_reduce_mul_epi32(__W);
}
int test_mm512_reduce_or_epi32(__m512i __W){
- // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32>
- // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- // CHECK: %or.i = or <8 x i32> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle3.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %or4.i = or <4 x i32> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <4 x i32> %or4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- // CHECK: %or7.i = or <4 x i32> %shuffle6.i, %or4.i
- // CHECK: %shuffle9.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- // CHECK: %or10.i = or <4 x i32> %shuffle9.i, %or7.i
- // CHECK: {{.*}} = bitcast <4 x i32> %or10.i to <2 x i64>
- // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0
- // CHECK: %conv.i = trunc i64 %vecext.i to i32
- // CHECK: ret i32 %conv.i
+// CHECK-LABEL: @test_mm512_reduce_or_epi32(
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: or <8 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: or <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK: or <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK: or <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
return _mm512_reduce_or_epi32(__W);
}
int test_mm512_reduce_and_epi32(__m512i __W){
- // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32>
- // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- // CHECK: %and.i = and <8 x i32> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle3.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %and4.i = and <4 x i32> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <4 x i32> %and4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- // CHECK: %and7.i = and <4 x i32> %shuffle6.i, %and4.i
- // CHECK: %shuffle9.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- // CHECK: %and10.i = and <4 x i32> %shuffle9.i, %and7.i
- // CHECK: {{.*}} = bitcast <4 x i32> %and10.i to <2 x i64>
- // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0
- // CHECK: %conv.i = trunc i64 %vecext.i to i32
- // CHECK: ret i32 %conv.i
+// CHECK-LABEL: @test_mm512_reduce_and_epi32(
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: and <8 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: and <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK: and <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK: and <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
return _mm512_reduce_and_epi32(__W);
}
int test_mm512_mask_reduce_add_epi32(__mmask16 __M, __m512i __W){
- // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32>
- // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1>
- // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32> zeroinitializer
- // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- // CHECK: %add.i = add <8 x i32> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle3.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %add4.i = add <4 x i32> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <4 x i32> %add4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- // CHECK: %add7.i = add <4 x i32> %shuffle6.i, %add4.i
- // CHECK: %shuffle9.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- // CHECK: %add10.i = add <4 x i32> %shuffle9.i, %add7.i
- // CHECK: {{.*}} = bitcast <4 x i32> %add10.i to <2 x i64>
- // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0
- // CHECK: %conv.i = trunc i64 %vecext.i to i32
- // CHECK: ret i32 %conv.i
+// CHECK-LABEL: @test_mm512_mask_reduce_add_epi32(
+// CHECK: bitcast i16 %{{.*}} to <16 x i1>
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+// CHECK: bitcast <16 x i32> %{{.*}} to <8 x i64>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: add <8 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: add <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK: add <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK: add <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
return _mm512_mask_reduce_add_epi32(__M, __W);
}
int test_mm512_mask_reduce_mul_epi32(__mmask16 __M, __m512i __W){
- // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32>
- // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1>
- // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
- // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- // CHECK: %mul.i = mul <8 x i32> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle3.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %mul4.i = mul <4 x i32> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <4 x i32> %mul4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- // CHECK: %mul7.i = mul <4 x i32> %shuffle6.i, %mul4.i
- // CHECK: %shuffle9.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- // CHECK: %mul10.i = mul <4 x i32> %shuffle9.i, %mul7.i
- // CHECK: {{.*}} = bitcast <4 x i32> %mul10.i to <2 x i64>
- // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0
- // CHECK: %conv.i = trunc i64 %vecext.i to i32
- // CHECK: ret i32 %conv.i
+// CHECK-LABEL: @test_mm512_mask_reduce_mul_epi32(
+// CHECK: bitcast i16 %{{.*}} to <16 x i1>
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+// CHECK: bitcast <16 x i32> %{{.*}} to <8 x i64>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: mul <8 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: mul <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK: mul <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK: mul <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
return _mm512_mask_reduce_mul_epi32(__M, __W);
}
int test_mm512_mask_reduce_and_epi32(__mmask16 __M, __m512i __W){
- // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32>
- // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1>
- // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
- // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- // CHECK: %and.i = and <8 x i32> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle3.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %and4.i = and <4 x i32> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <4 x i32> %and4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- // CHECK: %and7.i = and <4 x i32> %shuffle6.i, %and4.i
- // CHECK: %shuffle9.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- // CHECK: %and10.i = and <4 x i32> %shuffle9.i, %and7.i
- // CHECK: {{.*}} = bitcast <4 x i32> %and10.i to <2 x i64>
- // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0
- // CHECK: %conv.i = trunc i64 %vecext.i to i32
- // CHECK: ret i32 %conv.i
+// CHECK-LABEL: @test_mm512_mask_reduce_and_epi32(
+// CHECK: bitcast i16 %{{.*}} to <16 x i1>
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+// CHECK: bitcast <16 x i32> %{{.*}} to <8 x i64>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: and <8 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: and <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK: and <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK: and <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
return _mm512_mask_reduce_and_epi32(__M, __W);
}
int test_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W){
- // CHECK: {{.*}} = bitcast <8 x i64> %__W to <16 x i32>
- // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1>
- // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32> zeroinitializer
- // CHECK: %shuffle.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- // CHECK: %shuffle1.i = shufflevector <16 x i32> {{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- // CHECK: %or.i = or <8 x i32> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle3.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %or4.i = or <4 x i32> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <4 x i32> %or4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- // CHECK: %or7.i = or <4 x i32> %shuffle6.i, %or4.i
- // CHECK: %shuffle9.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- // CHECK: %or10.i = or <4 x i32> %shuffle9.i, %or7.i
- // CHECK: {{.*}} = bitcast <4 x i32> %or10.i to <2 x i64>
- // CHECK: %vecext.i = extractelement <2 x i64> {{.*}}, i32 0
- // CHECK: %conv.i = trunc i64 %vecext.i to i32
- // CHECK: ret i32 %conv.i
+// CHECK-LABEL: @test_mm512_mask_reduce_or_epi32(
+// CHECK: bitcast i16 %{{.*}} to <16 x i1>
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+// CHECK: bitcast <16 x i32> %{{.*}} to <8 x i64>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: or <8 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: or <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK: or <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK: or <4 x i32> %{{.*}}, %{{.*}}
+// CHECK: extractelement <4 x i32> %{{.*}}, i32 0
return _mm512_mask_reduce_or_epi32(__M, __W);
}
double test_mm512_reduce_add_pd(__m512d __W){
- // CHECK: %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
- // CHECK: %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
- // CHECK: %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
- // CHECK: %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
- // CHECK: %vecext.i = extractelement <2 x double> %add7.i, i32 0
- // CHECK: ret double %vecext.i
+// CHECK-LABEL: @test_mm512_reduce_add_pd(
+// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: fadd <4 x double> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: fadd <2 x double> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
+// CHECK: fadd <2 x double> %{{.*}}, %{{.*}}
+// CHECK: extractelement <2 x double> %{{.*}}, i32 0
return _mm512_reduce_add_pd(__W);
}
double test_mm512_reduce_mul_pd(__m512d __W){
- // CHECK: %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
- // CHECK: %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
- // CHECK: %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
- // CHECK: %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
- // CHECK: %vecext.i = extractelement <2 x double> %mul7.i, i32 0
- // CHECK: ret double %vecext.i
+// CHECK-LABEL: @test_mm512_reduce_mul_pd(
+// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: fmul <4 x double> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: fmul <2 x double> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
+// CHECK: fmul <2 x double> %{{.*}}, %{{.*}}
+// CHECK: extractelement <2 x double> %{{.*}}, i32 0
return _mm512_reduce_mul_pd(__W);
}
float test_mm512_reduce_add_ps(__m512 __W){
- // CHECK: %shuffle.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- // CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- // CHECK: %add.i = fadd <8 x float> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %add4.i = fadd <4 x float> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <4 x float> %add4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- // CHECK: %add7.i = fadd <4 x float> %add4.i, %shuffle6.i
- // CHECK: %shuffle9.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- // CHECK: %add10.i = fadd <4 x float> %add7.i, %shuffle9.i
- // CHECK: %vecext.i = extractelement <4 x float> %add10.i, i32 0
- // CHECK: ret float %vecext.i
+// CHECK-LABEL: @test_mm512_reduce_add_ps(
+// CHECK: bitcast <16 x float> %{{.*}} to <8 x double>
+// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
+// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
+// CHECK: fadd <8 x float> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: fadd <4 x float> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK: fadd <4 x float> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK: fadd <4 x float> %{{.*}}, %{{.*}}
+// CHECK: extractelement <4 x float> %{{.*}}, i32 0
return _mm512_reduce_add_ps(__W);
}
float test_mm512_reduce_mul_ps(__m512 __W){
- // CHECK: %shuffle.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- // CHECK: %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- // CHECK: %mul.i = fmul <8 x float> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %mul4.i = fmul <4 x float> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <4 x float> %mul4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- // CHECK: %mul7.i = fmul <4 x float> %mul4.i, %shuffle6.i
- // CHECK: %shuffle9.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- // CHECK: %mul10.i = fmul <4 x float> %mul7.i, %shuffle9.i
- // CHECK: %vecext.i = extractelement <4 x float> %mul10.i, i32 0
- // CHECK: ret float %vecext.i
+// CHECK-LABEL: @test_mm512_reduce_mul_ps(
+// CHECK: bitcast <16 x float> %{{.*}} to <8 x double>
+// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
+// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
+// CHECK: fmul <8 x float> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: fmul <4 x float> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK: fmul <4 x float> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK: fmul <4 x float> %{{.*}}, %{{.*}}
+// CHECK: extractelement <4 x float> %{{.*}}, i32 0
return _mm512_reduce_mul_ps(__W);
}
double test_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W){
- // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1>
- // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x double> %__W, <8 x double> zeroinitializer
- // CHECK: %shuffle.i = shufflevector <8 x double> {{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle1.i = shufflevector <8 x double> {{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
- // CHECK: %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
- // CHECK: %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
- // CHECK: %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
- // CHECK: %vecext.i = extractelement <2 x double> %add7.i, i32 0
- // CHECK: ret double %vecext.i
+// CHECK-LABEL: @test_mm512_mask_reduce_add_pd(
+// CHECK: bitcast i8 %{{.*}} to <8 x i1>
+// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: fadd <4 x double> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: fadd <2 x double> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
+// CHECK: fadd <2 x double> %{{.*}}, %{{.*}}
+// CHECK: extractelement <2 x double> %{{.*}}, i32 0
return _mm512_mask_reduce_add_pd(__M, __W);
}
double test_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W){
- // CHECK: {{.*}} = bitcast i8 %__M to <8 x i1>
- // CHECK: {{.*}} = select <8 x i1> {{.*}}, <8 x double> %__W, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
- // CHECK: %shuffle.i = shufflevector <8 x double> {{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle1.i = shufflevector <8 x double> {{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
- // CHECK: %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
- // CHECK: %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
- // CHECK: %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
- // CHECK: %vecext.i = extractelement <2 x double> %mul7.i, i32 0
- // CHECK: ret double %vecext.i
+// CHECK-LABEL: @test_mm512_mask_reduce_mul_pd(
+// CHECK: bitcast i8 %{{.*}} to <8 x i1>
+// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: fmul <4 x double> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: fmul <2 x double> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
+// CHECK: fmul <2 x double> %{{.*}}, %{{.*}}
+// CHECK: extractelement <2 x double> %{{.*}}, i32 0
return _mm512_mask_reduce_mul_pd(__M, __W);
}
float test_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W){
- // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1>
- // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x float> %__W, <16 x float> zeroinitializer
- // CHECK: %shuffle.i = shufflevector <16 x float> {{.*}}, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- // CHECK: %shuffle1.i = shufflevector <16 x float> {{.*}}, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- // CHECK: %add.i = fadd <8 x float> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %add4.i = fadd <4 x float> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <4 x float> %add4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- // CHECK: %add7.i = fadd <4 x float> %add4.i, %shuffle6.i
- // CHECK: %shuffle9.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- // CHECK: %add10.i = fadd <4 x float> %add7.i, %shuffle9.i
- // CHECK: %vecext.i = extractelement <4 x float> %add10.i, i32 0
- // CHECK: ret float %vecext.i
+// CHECK-LABEL: @test_mm512_mask_reduce_add_ps(
+// CHECK-NEXT: entry:
+// CHECK: bitcast i16 %{{.*}} to <16 x i1>
+// CHECK: select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> {{.*}}
+// CHECK: bitcast <16 x float> %{{.*}} to <8 x double>
+// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
+// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
+// CHECK: fadd <8 x float> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: fadd <4 x float> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK: fadd <4 x float> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK: fadd <4 x float> %{{.*}}, %{{.*}}
+// CHECK: extractelement <4 x float> %{{.*}}, i32 0
return _mm512_mask_reduce_add_ps(__M, __W);
}
float test_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W){
- // CHECK: {{.*}} = bitcast i16 %__M to <16 x i1>
- // CHECK: {{.*}} = select <16 x i1> {{.*}}, <16 x float> %__W, <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
- // CHECK: %shuffle.i = shufflevector <16 x float> {{.*}}, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- // CHECK: %shuffle1.i = shufflevector <16 x float> {{.*}}, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- // CHECK: %mul.i = fmul <8 x float> %shuffle.i, %shuffle1.i
- // CHECK: %shuffle2.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: %shuffle3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- // CHECK: %mul4.i = fmul <4 x float> %shuffle2.i, %shuffle3.i
- // CHECK: %shuffle6.i = shufflevector <4 x float> %mul4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- // CHECK: %mul7.i = fmul <4 x float> %mul4.i, %shuffle6.i
- // CHECK: %shuffle9.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- // CHECK: %mul10.i = fmul <4 x float> %mul7.i, %shuffle9.i
- // CHECK: %vecext.i = extractelement <4 x float> %mul10.i, i32 0
- // CHECK: ret float %vecext.i
+// CHECK-LABEL: @test_mm512_mask_reduce_mul_ps(
+// CHECK: bitcast i16 %{{.*}} to <16 x i1>
+// CHECK: select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> %{{.*}}
+// CHECK: bitcast <16 x float> %{{.*}} to <8 x double>
+// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
+// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
+// CHECK: fmul <8 x float> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: fmul <4 x float> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK: fmul <4 x float> %{{.*}}, %{{.*}}
+// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK: fmul <4 x float> %{{.*}}, %{{.*}}
+// CHECK: extractelement <4 x float> %{{.*}}, i32 0
return _mm512_mask_reduce_mul_ps(__M, __W);
}
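(The hunks above rewrite avx512-reduceIntrin.c to stop matching exact SSA value names and instead pattern-match the shape of the IR: each _mm512_reduce_* and _mm512_mask_reduce_* helper is expected to split the vector in half with a shufflevector, combine the halves with the scalar operation, and repeat until a single lane remains, which is then extracted; the masked forms first select the operation's identity value (zero for add/or, 1 for mul, all-ones for and, per the removed lines) into the masked-off lanes. As a rough illustration only — the helper name below is made up and this is not the code in avx512intrin.h — the add/epi32 ladder those CHECK lines describe looks like this in plain intrinsics:

#include <immintrin.h>

// Illustrative sketch of the 512->256->128->64->32 reduction ladder the
// CHECK patterns above expect for _mm512_reduce_add_epi32.
static inline int reduce_add_epi32_sketch(__m512i v) {
  __m256i lo256 = _mm512_extracti64x4_epi64(v, 0);   // lower 256 bits
  __m256i hi256 = _mm512_extracti64x4_epi64(v, 1);   // upper 256 bits
  __m256i sum256 = _mm256_add_epi32(lo256, hi256);   // add <8 x i32>
  __m128i lo128 = _mm256_extracti128_si256(sum256, 0);
  __m128i hi128 = _mm256_extracti128_si256(sum256, 1);
  __m128i sum128 = _mm_add_epi32(lo128, hi128);      // add <4 x i32>
  // fold elements <2,3> onto <0,1>, then element <1> onto <0>
  sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_SHUFFLE(1, 0, 3, 2)));
  sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_cvtsi128_si32(sum128);                  // extract lane 0
}

The mul/and/or and pd/ps variants checked above are the same ladder with the combining operation, identity value, and element type swapped.)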
diff --git a/test/CodeGen/avx512-reduceMinMaxIntrin.c b/test/CodeGen/avx512-reduceMinMaxIntrin.c
index 2081cef75460..fde7c0a273cd 100644
--- a/test/CodeGen/avx512-reduceMinMaxIntrin.c
+++ b/test/CodeGen/avx512-reduceMinMaxIntrin.c
@@ -3,2608 +3,2526 @@
#include <immintrin.h>
// CHECK-LABEL: define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I11_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I12_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I13_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I8_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I9_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I10_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK: store <8 x i64> %__W, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP4]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE1_I]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP5:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP7:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP8:%.*]] = icmp sgt <8 x i64> [[TMP5]], [[TMP6]]
-// CHECK: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP6]]
-// CHECK: store <8 x i64> [[TMP9]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE2_I:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP12]], <8 x i64> [[TMP13]], <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE2_I]], <8 x i64>* [[__A_ADDR_I12_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__B_ADDR_I13_I]], align 64
-// CHECK: [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I12_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I13_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP16:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP17:%.*]] = icmp sgt <8 x i64> [[TMP14]], [[TMP15]]
-// CHECK: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP14]], <8 x i64> [[TMP15]]
-// CHECK: store <8 x i64> [[TMP18]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE5_I:%.*]] = shufflevector <8 x i64> [[TMP19]], <8 x i64> [[TMP20]], <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <8 x i64> [[TMP21]], <8 x i64> [[TMP22]], <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE5_I]], <8 x i64>* [[__A_ADDR_I9_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE6_I]], <8 x i64>* [[__B_ADDR_I10_I]], align 64
-// CHECK: [[TMP23:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I9_I]], align 64
-// CHECK: [[TMP24:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I10_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I8_I]], align 64
-// CHECK: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I8_I]], align 64
-// CHECK: [[TMP26:%.*]] = icmp sgt <8 x i64> [[TMP23]], [[TMP24]]
-// CHECK: [[TMP27:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP23]], <8 x i64> [[TMP24]]
-// CHECK: store <8 x i64> [[TMP27]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP28]], i32 0
-// CHECK: ret i64 [[VECEXT_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__A_ADDR_I7_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I8_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I5_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I6_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__T1_I]], align 64
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[__T1_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <8 x i64> [[TMP5]], [[TMP6]]
+// CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[TMP6]]
+// CHECK-NEXT: store <8 x i64> [[TMP8]], <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP9:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[SHUFFLE1_I:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE1_I]], <8 x i64>* [[__T3_I]], align 64
+// CHECK-NEXT: [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__T3_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP11]], <8 x i64>* [[__A_ADDR_I7_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP12]], <8 x i64>* [[__B_ADDR_I8_I]], align 64
+// CHECK-NEXT: [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I7_I]], align 64
+// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I8_I]], align 64
+// CHECK-NEXT: [[TMP15:%.*]] = icmp sgt <8 x i64> [[TMP13]], [[TMP14]]
+// CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP15]], <8 x i64> [[TMP13]], <8 x i64> [[TMP14]]
+// CHECK-NEXT: store <8 x i64> [[TMP16]], <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP17:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> [[TMP18]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__T5_I]], align 64
+// CHECK-NEXT: [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__T5_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP19]], <8 x i64>* [[__A_ADDR_I5_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP20]], <8 x i64>* [[__B_ADDR_I6_I]], align 64
+// CHECK-NEXT: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I5_I]], align 64
+// CHECK-NEXT: [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I6_I]], align 64
+// CHECK-NEXT: [[TMP23:%.*]] = icmp sgt <8 x i64> [[TMP21]], [[TMP22]]
+// CHECK-NEXT: [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[TMP22]]
+// CHECK-NEXT: store <8 x i64> [[TMP24]], <8 x i64>* [[__T6_I]], align 64
+// CHECK-NEXT: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__T6_I]], align 64
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP25]], i32 0
+// CHECK-NEXT: ret i64 [[VECEXT_I]]
long long test_mm512_reduce_max_epi64(__m512i __W){
return _mm512_reduce_max_epi64(__W);
}
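(The block above is the fully expanded, unoptimized IR for _mm512_reduce_max_epi64: three rounds of rotating the vector — swap the 256-bit halves, then the 128-bit quarters, then adjacent 64-bit elements — each followed by an icmp sgt/select lane-wise maximum, after which every lane holds the overall maximum and lane 0 is extracted. As a rough sketch only, with a made-up name and not the avx512intrin.h implementation, the same ladder in intrinsics:

#include <immintrin.h>

// Illustrative sketch of the rotate-and-max ladder the CHECK-NEXT lines
// above expect; each _mm512_max_epi64 corresponds to an icmp sgt + select.
static inline long long reduce_max_epi64_sketch(__m512i v) {
  // swap the two 256-bit halves: lanes <4,5,6,7,0,1,2,3>
  v = _mm512_max_epi64(v, _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(1, 0, 3, 2)));
  // swap adjacent 128-bit quarters: lanes <2,3,0,1,6,7,4,5>
  v = _mm512_max_epi64(v, _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(2, 3, 0, 1)));
  // swap adjacent 64-bit elements: lanes <1,0,3,2,5,4,7,6>
  v = _mm512_max_epi64(v, _mm512_permutex_epi64(v, _MM_SHUFFLE(2, 3, 0, 1)));
  // every lane now holds the maximum; read lane 0
  return _mm_cvtsi128_si64(_mm512_castsi512_si128(v));
}

test_mm512_reduce_max_epu64 below is identical except that the comparisons are icmp ugt.)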
// CHECK-LABEL: define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I11_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I12_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I13_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I8_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I9_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I10_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK: store <8 x i64> %__W, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP4]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE1_I]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP5:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP7:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP8:%.*]] = icmp ugt <8 x i64> [[TMP5]], [[TMP6]]
-// CHECK: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP6]]
-// CHECK: store <8 x i64> [[TMP9]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE2_I:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP12]], <8 x i64> [[TMP13]], <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE2_I]], <8 x i64>* [[__A_ADDR_I12_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__B_ADDR_I13_I]], align 64
-// CHECK: [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I12_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I13_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP16:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP17:%.*]] = icmp ugt <8 x i64> [[TMP14]], [[TMP15]]
-// CHECK: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP14]], <8 x i64> [[TMP15]]
-// CHECK: store <8 x i64> [[TMP18]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE5_I:%.*]] = shufflevector <8 x i64> [[TMP19]], <8 x i64> [[TMP20]], <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <8 x i64> [[TMP21]], <8 x i64> [[TMP22]], <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE5_I]], <8 x i64>* [[__A_ADDR_I9_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE6_I]], <8 x i64>* [[__B_ADDR_I10_I]], align 64
-// CHECK: [[TMP23:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I9_I]], align 64
-// CHECK: [[TMP24:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I10_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I8_I]], align 64
-// CHECK: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I8_I]], align 64
-// CHECK: [[TMP26:%.*]] = icmp ugt <8 x i64> [[TMP23]], [[TMP24]]
-// CHECK: [[TMP27:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP23]], <8 x i64> [[TMP24]]
-// CHECK: store <8 x i64> [[TMP27]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP28]], i32 0
-// CHECK: ret i64 [[VECEXT_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__A_ADDR_I7_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I8_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I5_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I6_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__T1_I]], align 64
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[__T1_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP7:%.*]] = icmp ugt <8 x i64> [[TMP5]], [[TMP6]]
+// CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[TMP6]]
+// CHECK-NEXT: store <8 x i64> [[TMP8]], <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP9:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[SHUFFLE1_I:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE1_I]], <8 x i64>* [[__T3_I]], align 64
+// CHECK-NEXT: [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__T3_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP11]], <8 x i64>* [[__A_ADDR_I7_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP12]], <8 x i64>* [[__B_ADDR_I8_I]], align 64
+// CHECK-NEXT: [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I7_I]], align 64
+// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I8_I]], align 64
+// CHECK-NEXT: [[TMP15:%.*]] = icmp ugt <8 x i64> [[TMP13]], [[TMP14]]
+// CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP15]], <8 x i64> [[TMP13]], <8 x i64> [[TMP14]]
+// CHECK-NEXT: store <8 x i64> [[TMP16]], <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP17:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> [[TMP18]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__T5_I]], align 64
+// CHECK-NEXT: [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__T5_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP19]], <8 x i64>* [[__A_ADDR_I5_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP20]], <8 x i64>* [[__B_ADDR_I6_I]], align 64
+// CHECK-NEXT: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I5_I]], align 64
+// CHECK-NEXT: [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I6_I]], align 64
+// CHECK-NEXT: [[TMP23:%.*]] = icmp ugt <8 x i64> [[TMP21]], [[TMP22]]
+// CHECK-NEXT: [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[TMP22]]
+// CHECK-NEXT: store <8 x i64> [[TMP24]], <8 x i64>* [[__T6_I]], align 64
+// CHECK-NEXT: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__T6_I]], align 64
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP25]], i32 0
+// CHECK-NEXT: ret i64 [[VECEXT_I]]
unsigned long long test_mm512_reduce_max_epu64(__m512i __W){
return _mm512_reduce_max_epu64(__W);
}
// CHECK-LABEL: define double @test_mm512_reduce_max_pd(<8 x double> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I11_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__A_ADDR_I12_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__B_ADDR_I13_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I8_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__A_ADDR_I9_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__B_ADDR_I10_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x double>, align 64
-// CHECK: store <8 x double> %__W, <8 x double>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load <8 x double>, <8 x double>* [[__W_ADDR]], align 64
-// CHECK: store <8 x double> [[TMP0]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP3:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP4:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> [[TMP4]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x double> [[SHUFFLE_I]], <8 x double>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x double> [[SHUFFLE1_I]], <8 x double>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP5:%.*]] = load <8 x double>, <8 x double>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP6:%.*]] = load <8 x double>, <8 x double>* [[__B_ADDR_I_I]], align 64
-// CHECK: store <8 x double> zeroinitializer, <8 x double>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP7:%.*]] = load <8 x double>, <8 x double>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> [[TMP5]], <8 x double> [[TMP6]], <8 x double> [[TMP7]], i8 -1, i32 4) #2
-// CHECK: store <8 x double> [[TMP8]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP9:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP10:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE2_I:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP11:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP12:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> [[TMP12]], <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x double> [[SHUFFLE2_I]], <8 x double>* [[__A_ADDR_I12_I]], align 64
-// CHECK: store <8 x double> [[SHUFFLE3_I]], <8 x double>* [[__B_ADDR_I13_I]], align 64
-// CHECK: [[TMP13:%.*]] = load <8 x double>, <8 x double>* [[__A_ADDR_I12_I]], align 64
-// CHECK: [[TMP14:%.*]] = load <8 x double>, <8 x double>* [[__B_ADDR_I13_I]], align 64
-// CHECK: store <8 x double> zeroinitializer, <8 x double>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <8 x double>, <8 x double>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP16:%.*]] = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> [[TMP13]], <8 x double> [[TMP14]], <8 x double> [[TMP15]], i8 -1, i32 4) #2
-// CHECK: store <8 x double> [[TMP16]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP17:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP18:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE5_I:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP19:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <8 x double> [[TMP19]], <8 x double> [[TMP20]], <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x double> [[SHUFFLE5_I]], <8 x double>* [[__A_ADDR_I9_I]], align 64
-// CHECK: store <8 x double> [[SHUFFLE6_I]], <8 x double>* [[__B_ADDR_I10_I]], align 64
-// CHECK: [[TMP21:%.*]] = load <8 x double>, <8 x double>* [[__A_ADDR_I9_I]], align 64
-// CHECK: [[TMP22:%.*]] = load <8 x double>, <8 x double>* [[__B_ADDR_I10_I]], align 64
-// CHECK: store <8 x double> zeroinitializer, <8 x double>* [[_COMPOUNDLITERAL_I_I8_I]], align 64
-// CHECK: [[TMP23:%.*]] = load <8 x double>, <8 x double>* [[_COMPOUNDLITERAL_I_I8_I]], align 64
-// CHECK: [[TMP24:%.*]] = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> [[TMP21]], <8 x double> [[TMP22]], <8 x double> [[TMP23]], i8 -1, i32 4) #2
-// CHECK: store <8 x double> [[TMP24]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP25:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x double> [[TMP25]], i32 0
-// CHECK: ret double [[VECEXT_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__A_ADDR_I10_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__B_ADDR_I11_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__A_ADDR_I8_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__B_ADDR_I9_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__T7_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__T8_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT: store <8 x double> [[__W:%.*]], <8 x double>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, <8 x double>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store <8 x double> [[TMP0]], <8 x double>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT_I:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <4 x double> [[EXTRACT_I]], <4 x double>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT2_I:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: store <4 x double> [[EXTRACT2_I]], <4 x double>* [[__T2_I]], align 32
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* [[__T2_I]], align 32
+// CHECK-NEXT: store <4 x double> [[TMP3]], <4 x double>* [[__A_ADDR_I10_I]], align 32
+// CHECK-NEXT: store <4 x double> [[TMP4]], <4 x double>* [[__B_ADDR_I11_I]], align 32
+// CHECK-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* [[__A_ADDR_I10_I]], align 32
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x double>, <4 x double>* [[__B_ADDR_I11_I]], align 32
+// CHECK-NEXT: [[TMP7:%.*]] = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> [[TMP5]], <4 x double> [[TMP6]]) #2
+// CHECK-NEXT: store <4 x double> [[TMP7]], <4 x double>* [[__T3_I]], align 32
+// CHECK-NEXT: [[TMP8:%.*]] = load <4 x double>, <4 x double>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT4_I:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> undef, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: store <2 x double> [[EXTRACT4_I]], <2 x double>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP9:%.*]] = load <4 x double>, <4 x double>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT5_I:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> undef, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: store <2 x double> [[EXTRACT5_I]], <2 x double>* [[__T5_I]], align 16
+// CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double>* [[__T5_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[__A_ADDR_I8_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[__B_ADDR_I9_I]], align 16
+// CHECK-NEXT: [[TMP12:%.*]] = load <2 x double>, <2 x double>* [[__A_ADDR_I8_I]], align 16
+// CHECK-NEXT: [[TMP13:%.*]] = load <2 x double>, <2 x double>* [[__B_ADDR_I9_I]], align 16
+// CHECK-NEXT: [[TMP14:%.*]] = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> [[TMP12]], <2 x double> [[TMP13]]) #2
+// CHECK-NEXT: store <2 x double> [[TMP14]], <2 x double>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP15:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP16:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> [[TMP16]], <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: store <2 x double> [[SHUFFLE_I]], <2 x double>* [[__T7_I]], align 16
+// CHECK-NEXT: [[TMP17:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP18:%.*]] = load <2 x double>, <2 x double>* [[__T7_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP17]], <2 x double>* [[__A_ADDR_I_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP18]], <2 x double>* [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP19:%.*]] = load <2 x double>, <2 x double>* [[__A_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP20:%.*]] = load <2 x double>, <2 x double>* [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP21:%.*]] = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> [[TMP19]], <2 x double> [[TMP20]]) #2
+// CHECK-NEXT: store <2 x double> [[TMP21]], <2 x double>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP22:%.*]] = load <2 x double>, <2 x double>* [[__T8_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP22]], i32 0
+// CHECK-NEXT: ret double [[VECEXT_I]]
double test_mm512_reduce_max_pd(__m512d __W){
return _mm512_reduce_max_pd(__W);
}
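The updated CHECK lines above describe a halving reduction: the 512-bit input is split into 256-bit halves combined with llvm.x86.avx.max.pd.256, that result is split into 128-bit halves combined with llvm.x86.sse2.max.pd, and a final lane swap plus one more max leaves the answer in element 0. A minimal standalone sketch of that shape in intrinsics follows; it is not the in-tree avx512fintrin.h code, and the helper name is invented for illustration (_mm512_reduce_min_pd further down follows the same shape with min).

#include <immintrin.h>

// Hedged sketch of the split-in-half reduction the new IR encodes.
static double reduce_max_pd_sketch(__m512d v) {
  __m256d lo256 = _mm512_castpd512_pd256(v);        // elements 0..3
  __m256d hi256 = _mm512_extractf64x4_pd(v, 1);     // elements 4..7
  __m256d m256  = _mm256_max_pd(lo256, hi256);      // 4-wide max
  __m128d lo128 = _mm256_castpd256_pd128(m256);     // elements 0..1
  __m128d hi128 = _mm256_extractf128_pd(m256, 1);   // elements 2..3
  __m128d m128  = _mm_max_pd(lo128, hi128);         // 2-wide max
  __m128d swap  = _mm_shuffle_pd(m128, m128, 1);    // swap the two remaining lanes
  return _mm_cvtsd_f64(_mm_max_pd(m128, swap));     // overall max now in element 0
}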
// CHECK-LABEL: define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I11_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I12_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I13_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I8_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I9_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I10_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK: store <8 x i64> %__W, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP4]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE1_I]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP5:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP7:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP8:%.*]] = icmp sgt <8 x i64> [[TMP5]], [[TMP6]]
-// CHECK: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP6]]
-// CHECK: store <8 x i64> [[TMP9]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE2_I:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP12]], <8 x i64> [[TMP13]], <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE2_I]], <8 x i64>* [[__A_ADDR_I12_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__B_ADDR_I13_I]], align 64
-// CHECK: [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I12_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I13_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP16:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP17:%.*]] = icmp sgt <8 x i64> [[TMP14]], [[TMP15]]
-// CHECK: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP14]], <8 x i64> [[TMP15]]
-// CHECK: store <8 x i64> [[TMP18]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE5_I:%.*]] = shufflevector <8 x i64> [[TMP19]], <8 x i64> [[TMP20]], <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <8 x i64> [[TMP21]], <8 x i64> [[TMP22]], <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE5_I]], <8 x i64>* [[__A_ADDR_I9_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE6_I]], <8 x i64>* [[__B_ADDR_I10_I]], align 64
-// CHECK: [[TMP23:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I9_I]], align 64
-// CHECK: [[TMP24:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I10_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I8_I]], align 64
-// CHECK: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I8_I]], align 64
-// CHECK: [[TMP26:%.*]] = icmp sgt <8 x i64> [[TMP23]], [[TMP24]]
-// CHECK: [[TMP27:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP23]], <8 x i64> [[TMP24]]
-// CHECK: store <8 x i64> [[TMP27]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP28]], i32 0
-// CHECK: ret i64 [[VECEXT_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__A_ADDR_I7_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I8_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I5_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I6_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__T1_I]], align 64
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[__T1_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP7:%.*]] = icmp slt <8 x i64> [[TMP5]], [[TMP6]]
+// CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[TMP6]]
+// CHECK-NEXT: store <8 x i64> [[TMP8]], <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP9:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[SHUFFLE1_I:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE1_I]], <8 x i64>* [[__T3_I]], align 64
+// CHECK-NEXT: [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__T3_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP11]], <8 x i64>* [[__A_ADDR_I7_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP12]], <8 x i64>* [[__B_ADDR_I8_I]], align 64
+// CHECK-NEXT: [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I7_I]], align 64
+// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I8_I]], align 64
+// CHECK-NEXT: [[TMP15:%.*]] = icmp slt <8 x i64> [[TMP13]], [[TMP14]]
+// CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP15]], <8 x i64> [[TMP13]], <8 x i64> [[TMP14]]
+// CHECK-NEXT: store <8 x i64> [[TMP16]], <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP17:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> [[TMP18]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__T5_I]], align 64
+// CHECK-NEXT: [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__T5_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP19]], <8 x i64>* [[__A_ADDR_I5_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP20]], <8 x i64>* [[__B_ADDR_I6_I]], align 64
+// CHECK-NEXT: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I5_I]], align 64
+// CHECK-NEXT: [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I6_I]], align 64
+// CHECK-NEXT: [[TMP23:%.*]] = icmp slt <8 x i64> [[TMP21]], [[TMP22]]
+// CHECK-NEXT: [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[TMP22]]
+// CHECK-NEXT: store <8 x i64> [[TMP24]], <8 x i64>* [[__T6_I]], align 64
+// CHECK-NEXT: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__T6_I]], align 64
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP25]], i32 0
+// CHECK-NEXT: ret i64 [[VECEXT_I]]
long long test_mm512_reduce_min_epi64(__m512i __W){
- return _mm512_reduce_max_epi64(__W);
+ return _mm512_reduce_min_epi64(__W);
}
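For the 64-bit integer reductions the new IR stays in 512-bit vectors: each step shuffles the running value so that partner elements line up (first the 256-bit halves, then the 128-bit lanes, then adjacent elements) and keeps the smaller of each pair via the icmp slt/select seen above. A hedged intrinsics sketch of that ladder follows; the helper name is invented, and the epu64 test below is the same pattern with an unsigned compare (_mm512_min_epu64).

#include <immintrin.h>

// Standalone sketch of the lane-swapping reduction the new CHECK lines encode
// for _mm512_reduce_min_epi64; assumes an x86-64 target with AVX512F enabled.
static long long reduce_min_epi64_sketch(__m512i v) {
  __m512i t;
  t = _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(1, 0, 3, 2)); // <4,5,6,7,0,1,2,3>
  v = _mm512_min_epi64(v, t);                              // fold 256-bit halves
  t = _mm512_permutex_epi64(v, _MM_SHUFFLE(1, 0, 3, 2));   // <2,3,0,1,6,7,4,5>
  v = _mm512_min_epi64(v, t);                              // fold 128-bit lanes
  t = _mm512_permutex_epi64(v, _MM_SHUFFLE(2, 3, 0, 1));   // <1,0,3,2,5,4,7,6>
  v = _mm512_min_epi64(v, t);                              // fold adjacent elements
  return _mm_cvtsi128_si64(_mm512_castsi512_si128(v));     // minimum now in element 0
}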
// CHECK-LABEL: define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I11_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I12_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I13_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I8_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I9_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I10_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK: store <8 x i64> %__W, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP4]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE1_I]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP5:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP7:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP8:%.*]] = icmp ugt <8 x i64> [[TMP5]], [[TMP6]]
-// CHECK: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP6]]
-// CHECK: store <8 x i64> [[TMP9]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE2_I:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP12]], <8 x i64> [[TMP13]], <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE2_I]], <8 x i64>* [[__A_ADDR_I12_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__B_ADDR_I13_I]], align 64
-// CHECK: [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I12_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I13_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP16:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP17:%.*]] = icmp ugt <8 x i64> [[TMP14]], [[TMP15]]
-// CHECK: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP14]], <8 x i64> [[TMP15]]
-// CHECK: store <8 x i64> [[TMP18]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE5_I:%.*]] = shufflevector <8 x i64> [[TMP19]], <8 x i64> [[TMP20]], <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <8 x i64> [[TMP21]], <8 x i64> [[TMP22]], <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE5_I]], <8 x i64>* [[__A_ADDR_I9_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE6_I]], <8 x i64>* [[__B_ADDR_I10_I]], align 64
-// CHECK: [[TMP23:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I9_I]], align 64
-// CHECK: [[TMP24:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I10_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I8_I]], align 64
-// CHECK: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I8_I]], align 64
-// CHECK: [[TMP26:%.*]] = icmp ugt <8 x i64> [[TMP23]], [[TMP24]]
-// CHECK: [[TMP27:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP23]], <8 x i64> [[TMP24]]
-// CHECK: store <8 x i64> [[TMP27]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP28]], i32 0
-// CHECK: ret i64 [[VECEXT_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__A_ADDR_I7_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I8_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I5_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I6_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__T1_I]], align 64
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[__T1_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP7:%.*]] = icmp ult <8 x i64> [[TMP5]], [[TMP6]]
+// CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[TMP6]]
+// CHECK-NEXT: store <8 x i64> [[TMP8]], <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP9:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[SHUFFLE1_I:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE1_I]], <8 x i64>* [[__T3_I]], align 64
+// CHECK-NEXT: [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__T3_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP11]], <8 x i64>* [[__A_ADDR_I7_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP12]], <8 x i64>* [[__B_ADDR_I8_I]], align 64
+// CHECK-NEXT: [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I7_I]], align 64
+// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I8_I]], align 64
+// CHECK-NEXT: [[TMP15:%.*]] = icmp ult <8 x i64> [[TMP13]], [[TMP14]]
+// CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP15]], <8 x i64> [[TMP13]], <8 x i64> [[TMP14]]
+// CHECK-NEXT: store <8 x i64> [[TMP16]], <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP17:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> [[TMP18]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__T5_I]], align 64
+// CHECK-NEXT: [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__T5_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP19]], <8 x i64>* [[__A_ADDR_I5_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP20]], <8 x i64>* [[__B_ADDR_I6_I]], align 64
+// CHECK-NEXT: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I5_I]], align 64
+// CHECK-NEXT: [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I6_I]], align 64
+// CHECK-NEXT: [[TMP23:%.*]] = icmp ult <8 x i64> [[TMP21]], [[TMP22]]
+// CHECK-NEXT: [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[TMP22]]
+// CHECK-NEXT: store <8 x i64> [[TMP24]], <8 x i64>* [[__T6_I]], align 64
+// CHECK-NEXT: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__T6_I]], align 64
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP25]], i32 0
+// CHECK-NEXT: ret i64 [[VECEXT_I]]
unsigned long long test_mm512_reduce_min_epu64(__m512i __W){
- return _mm512_reduce_max_epu64(__W);
+ return _mm512_reduce_min_epu64(__W);
}
// CHECK-LABEL: define double @test_mm512_reduce_min_pd(<8 x double> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I11_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__A_ADDR_I12_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__B_ADDR_I13_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I8_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__A_ADDR_I9_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__B_ADDR_I10_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x double>, align 64
-// CHECK: store <8 x double> %__W, <8 x double>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load <8 x double>, <8 x double>* [[__W_ADDR]], align 64
-// CHECK: store <8 x double> [[TMP0]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP3:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP4:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> [[TMP4]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x double> [[SHUFFLE_I]], <8 x double>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x double> [[SHUFFLE1_I]], <8 x double>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP5:%.*]] = load <8 x double>, <8 x double>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP6:%.*]] = load <8 x double>, <8 x double>* [[__B_ADDR_I_I]], align 64
-// CHECK: store <8 x double> zeroinitializer, <8 x double>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP7:%.*]] = load <8 x double>, <8 x double>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> [[TMP5]], <8 x double> [[TMP6]], <8 x double> [[TMP7]], i8 -1, i32 4) #2
-// CHECK: store <8 x double> [[TMP8]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP9:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP10:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE2_I:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP11:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP12:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> [[TMP12]], <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x double> [[SHUFFLE2_I]], <8 x double>* [[__A_ADDR_I12_I]], align 64
-// CHECK: store <8 x double> [[SHUFFLE3_I]], <8 x double>* [[__B_ADDR_I13_I]], align 64
-// CHECK: [[TMP13:%.*]] = load <8 x double>, <8 x double>* [[__A_ADDR_I12_I]], align 64
-// CHECK: [[TMP14:%.*]] = load <8 x double>, <8 x double>* [[__B_ADDR_I13_I]], align 64
-// CHECK: store <8 x double> zeroinitializer, <8 x double>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <8 x double>, <8 x double>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP16:%.*]] = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> [[TMP13]], <8 x double> [[TMP14]], <8 x double> [[TMP15]], i8 -1, i32 4) #2
-// CHECK: store <8 x double> [[TMP16]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP17:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP18:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE5_I:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP19:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <8 x double> [[TMP19]], <8 x double> [[TMP20]], <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x double> [[SHUFFLE5_I]], <8 x double>* [[__A_ADDR_I9_I]], align 64
-// CHECK: store <8 x double> [[SHUFFLE6_I]], <8 x double>* [[__B_ADDR_I10_I]], align 64
-// CHECK: [[TMP21:%.*]] = load <8 x double>, <8 x double>* [[__A_ADDR_I9_I]], align 64
-// CHECK: [[TMP22:%.*]] = load <8 x double>, <8 x double>* [[__B_ADDR_I10_I]], align 64
-// CHECK: store <8 x double> zeroinitializer, <8 x double>* [[_COMPOUNDLITERAL_I_I8_I]], align 64
-// CHECK: [[TMP23:%.*]] = load <8 x double>, <8 x double>* [[_COMPOUNDLITERAL_I_I8_I]], align 64
-// CHECK: [[TMP24:%.*]] = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> [[TMP21]], <8 x double> [[TMP22]], <8 x double> [[TMP23]], i8 -1, i32 4) #2
-// CHECK: store <8 x double> [[TMP24]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP25:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x double> [[TMP25]], i32 0
-// CHECK: ret double [[VECEXT_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__A_ADDR_I10_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__B_ADDR_I11_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__A_ADDR_I8_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__B_ADDR_I9_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__T7_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__T8_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT: store <8 x double> [[__W:%.*]], <8 x double>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, <8 x double>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store <8 x double> [[TMP0]], <8 x double>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT_I:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <4 x double> [[EXTRACT_I]], <4 x double>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT2_I:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: store <4 x double> [[EXTRACT2_I]], <4 x double>* [[__T2_I]], align 32
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* [[__T2_I]], align 32
+// CHECK-NEXT: store <4 x double> [[TMP3]], <4 x double>* [[__A_ADDR_I10_I]], align 32
+// CHECK-NEXT: store <4 x double> [[TMP4]], <4 x double>* [[__B_ADDR_I11_I]], align 32
+// CHECK-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* [[__A_ADDR_I10_I]], align 32
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x double>, <4 x double>* [[__B_ADDR_I11_I]], align 32
+// CHECK-NEXT: [[TMP7:%.*]] = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> [[TMP5]], <4 x double> [[TMP6]]) #2
+// CHECK-NEXT: store <4 x double> [[TMP7]], <4 x double>* [[__T3_I]], align 32
+// CHECK-NEXT: [[TMP8:%.*]] = load <4 x double>, <4 x double>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT4_I:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> undef, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: store <2 x double> [[EXTRACT4_I]], <2 x double>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP9:%.*]] = load <4 x double>, <4 x double>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT5_I:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> undef, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: store <2 x double> [[EXTRACT5_I]], <2 x double>* [[__T5_I]], align 16
+// CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double>* [[__T5_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[__A_ADDR_I8_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[__B_ADDR_I9_I]], align 16
+// CHECK-NEXT: [[TMP12:%.*]] = load <2 x double>, <2 x double>* [[__A_ADDR_I8_I]], align 16
+// CHECK-NEXT: [[TMP13:%.*]] = load <2 x double>, <2 x double>* [[__B_ADDR_I9_I]], align 16
+// CHECK-NEXT: [[TMP14:%.*]] = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> [[TMP12]], <2 x double> [[TMP13]]) #2
+// CHECK-NEXT: store <2 x double> [[TMP14]], <2 x double>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP15:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP16:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> [[TMP16]], <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: store <2 x double> [[SHUFFLE_I]], <2 x double>* [[__T7_I]], align 16
+// CHECK-NEXT: [[TMP17:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP18:%.*]] = load <2 x double>, <2 x double>* [[__T7_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP17]], <2 x double>* [[__A_ADDR_I_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP18]], <2 x double>* [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP19:%.*]] = load <2 x double>, <2 x double>* [[__A_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP20:%.*]] = load <2 x double>, <2 x double>* [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP21:%.*]] = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> [[TMP19]], <2 x double> [[TMP20]]) #2
+// CHECK-NEXT: store <2 x double> [[TMP21]], <2 x double>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP22:%.*]] = load <2 x double>, <2 x double>* [[__T8_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP22]], i32 0
+// CHECK-NEXT: ret double [[VECEXT_I]]
double test_mm512_reduce_min_pd(__m512d __W){
return _mm512_reduce_min_pd(__W);
}
// CHECK-LABEL: define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I12_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I13_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I14_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I9_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I10_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I11_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__D_ADDR_I_I:%.*]] = alloca i64, align 8
-// CHECK: [[_COMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__M_ADDR_I:%.*]] = alloca i8, align 1
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__M_ADDR:%.*]] = alloca i8, align 1
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK: store i8 %__M, i8* [[__M_ADDR]], align 1
-// CHECK: store <8 x i64> %__W, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
-// CHECK: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
-// CHECK: store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
-// CHECK: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: store i64 -9223372036854775808, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[TMP4:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT_I_I:%.*]] = insertelement <8 x i64> undef, i64 [[TMP4]], i32 0
-// CHECK: [[TMP5:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <8 x i64> [[VECINIT_I_I]], i64 [[TMP5]], i32 1
-// CHECK: [[TMP6:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <8 x i64> [[VECINIT1_I_I]], i64 [[TMP6]], i32 2
-// CHECK: [[TMP7:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <8 x i64> [[VECINIT2_I_I]], i64 [[TMP7]], i32 3
-// CHECK: [[TMP8:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT4_I_I:%.*]] = insertelement <8 x i64> [[VECINIT3_I_I]], i64 [[TMP8]], i32 4
-// CHECK: [[TMP9:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT5_I_I:%.*]] = insertelement <8 x i64> [[VECINIT4_I_I]], i64 [[TMP9]], i32 5
-// CHECK: [[TMP10:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT6_I_I:%.*]] = insertelement <8 x i64> [[VECINIT5_I_I]], i64 [[TMP10]], i32 6
-// CHECK: [[TMP11:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT7_I_I:%.*]] = insertelement <8 x i64> [[VECINIT6_I_I]], i64 [[TMP11]], i32 7
-// CHECK: store <8 x i64> [[VECINIT7_I_I]], <8 x i64>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP13:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-// CHECK: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP3]], <8 x i64> [[TMP12]]
-// CHECK: store <8 x i64> [[TMP14]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP16:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP15]], <8 x i64> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP17:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> [[TMP18]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__A_ADDR_I13_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE1_I]], <8 x i64>* [[__B_ADDR_I14_I]], align 64
-// CHECK: [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I13_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I14_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP22:%.*]] = icmp sgt <8 x i64> [[TMP19]], [[TMP20]]
-// CHECK: [[TMP23:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP19]], <8 x i64> [[TMP20]]
-// CHECK: store <8 x i64> [[TMP23]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP24:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP24]], <8 x i64> [[TMP25]], <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP26:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP27:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE4_I:%.*]] = shufflevector <8 x i64> [[TMP26]], <8 x i64> [[TMP27]], <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__A_ADDR_I10_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE4_I]], <8 x i64>* [[__B_ADDR_I11_I]], align 64
-// CHECK: [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I10_I]], align 64
-// CHECK: [[TMP29:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I11_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I9_I]], align 64
-// CHECK: [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I9_I]], align 64
-// CHECK: [[TMP31:%.*]] = icmp sgt <8 x i64> [[TMP28]], [[TMP29]]
-// CHECK: [[TMP32:%.*]] = select <8 x i1> [[TMP31]], <8 x i64> [[TMP28]], <8 x i64> [[TMP29]]
-// CHECK: store <8 x i64> [[TMP32]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP33:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP34:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <8 x i64> [[TMP33]], <8 x i64> [[TMP34]], <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP35:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP36:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE7_I:%.*]] = shufflevector <8 x i64> [[TMP35]], <8 x i64> [[TMP36]], <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE6_I]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE7_I]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP37:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP38:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP39:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP40:%.*]] = icmp sgt <8 x i64> [[TMP37]], [[TMP38]]
-// CHECK: [[TMP41:%.*]] = select <8 x i1> [[TMP40]], <8 x i64> [[TMP37]], <8 x i64> [[TMP38]]
-// CHECK: store <8 x i64> [[TMP41]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP42:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP42]], i32 0
-// CHECK: ret i64 [[VECEXT_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__W_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__U_ADDR_I_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__A_ADDR_I11_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I9_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I10_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I7_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I8_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__D_ADDR_I_I:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__M_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__M_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: store i8 [[__M:%.*]], i8* [[__M_ADDR]], align 1
+// CHECK-NEXT: store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
+// CHECK-NEXT: store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store i64 -9223372036854775808, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <8 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <8 x i64> [[VECINIT_I_I]], i64 [[TMP3]], i32 1
+// CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <8 x i64> [[VECINIT1_I_I]], i64 [[TMP4]], i32 2
+// CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <8 x i64> [[VECINIT2_I_I]], i64 [[TMP5]], i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT4_I_I:%.*]] = insertelement <8 x i64> [[VECINIT3_I_I]], i64 [[TMP6]], i32 4
+// CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT5_I_I:%.*]] = insertelement <8 x i64> [[VECINIT4_I_I]], i64 [[TMP7]], i32 5
+// CHECK-NEXT: [[TMP8:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT6_I_I:%.*]] = insertelement <8 x i64> [[VECINIT5_I_I]], i64 [[TMP8]], i32 6
+// CHECK-NEXT: [[TMP9:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT7_I_I:%.*]] = insertelement <8 x i64> [[VECINIT6_I_I]], i64 [[TMP9]], i32 7
+// CHECK-NEXT: store <8 x i64> [[VECINIT7_I_I]], <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
+// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP10]], <8 x i64>* [[__W_ADDR_I_I]], align 64
+// CHECK-NEXT: store i8 [[TMP11]], i8* [[__U_ADDR_I_I]], align 1
+// CHECK-NEXT: store <8 x i64> [[TMP12]], <8 x i64>* [[__A_ADDR_I11_I]], align 64
+// CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[__U_ADDR_I_I]], align 1
+// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I11_I]], align 64
+// CHECK-NEXT: [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+// CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP14]], <8 x i64> [[TMP15]]
+// CHECK-NEXT: store <8 x i64> [[TMP17]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP18]], <8 x i64> [[TMP19]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__T1_I]], align 64
+// CHECK-NEXT: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__T1_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP20]], <8 x i64>* [[__A_ADDR_I9_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP21]], <8 x i64>* [[__B_ADDR_I10_I]], align 64
+// CHECK-NEXT: [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I9_I]], align 64
+// CHECK-NEXT: [[TMP23:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I10_I]], align 64
+// CHECK-NEXT: [[TMP24:%.*]] = icmp sgt <8 x i64> [[TMP22]], [[TMP23]]
+// CHECK-NEXT: [[TMP25:%.*]] = select <8 x i1> [[TMP24]], <8 x i64> [[TMP22]], <8 x i64> [[TMP23]]
+// CHECK-NEXT: store <8 x i64> [[TMP25]], <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP26:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP27:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP26]], <8 x i64> [[TMP27]], <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__T3_I]], align 64
+// CHECK-NEXT: [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP29:%.*]] = load <8 x i64>, <8 x i64>* [[__T3_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP28]], <8 x i64>* [[__A_ADDR_I7_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP29]], <8 x i64>* [[__B_ADDR_I8_I]], align 64
+// CHECK-NEXT: [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I7_I]], align 64
+// CHECK-NEXT: [[TMP31:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I8_I]], align 64
+// CHECK-NEXT: [[TMP32:%.*]] = icmp sgt <8 x i64> [[TMP30]], [[TMP31]]
+// CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP32]], <8 x i64> [[TMP30]], <8 x i64> [[TMP31]]
+// CHECK-NEXT: store <8 x i64> [[TMP33]], <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP34:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP35:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[SHUFFLE5_I:%.*]] = shufflevector <8 x i64> [[TMP34]], <8 x i64> [[TMP35]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE5_I]], <8 x i64>* [[__T5_I]], align 64
+// CHECK-NEXT: [[TMP36:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP37:%.*]] = load <8 x i64>, <8 x i64>* [[__T5_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP36]], <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP37]], <8 x i64>* [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP38:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP39:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP40:%.*]] = icmp sgt <8 x i64> [[TMP38]], [[TMP39]]
+// CHECK-NEXT: [[TMP41:%.*]] = select <8 x i1> [[TMP40]], <8 x i64> [[TMP38]], <8 x i64> [[TMP39]]
+// CHECK-NEXT: store <8 x i64> [[TMP41]], <8 x i64>* [[__T6_I]], align 64
+// CHECK-NEXT: [[TMP42:%.*]] = load <8 x i64>, <8 x i64>* [[__T6_I]], align 64
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP42]], i32 0
+// CHECK-NEXT: ret i64 [[VECEXT_I]]
long long test_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __W){
return _mm512_mask_reduce_max_epi64(__M, __W);
}
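The masked variants differ only in the setup visible at the top of the new IR: the bitcast of the i8 mask to <8 x i1> plus select replaces inactive elements with the identity for the operation, here the i64 minimum -9223372036854775808 so they can never win the signed-max comparison (the unsigned-max test below uses 0), and the shuffle/compare ladder that follows is unchanged. A hedged sketch of that setup, with an invented helper name and assuming <immintrin.h> and <limits.h>:

#include <immintrin.h>
#include <limits.h>

// Inactive lanes get LLONG_MIN (the identity for signed max) before the usual
// reduction runs; this mirrors the mask select at the top of the new IR.
static long long mask_reduce_max_epi64_sketch(__mmask8 m, __m512i v) {
  __m512i t;
  v = _mm512_mask_mov_epi64(_mm512_set1_epi64(LLONG_MIN), m, v);
  t = _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(1, 0, 3, 2)); // fold 256-bit halves
  v = _mm512_max_epi64(v, t);
  t = _mm512_permutex_epi64(v, _MM_SHUFFLE(1, 0, 3, 2));   // fold 128-bit lanes
  v = _mm512_max_epi64(v, t);
  t = _mm512_permutex_epi64(v, _MM_SHUFFLE(2, 3, 0, 1));   // fold adjacent elements
  v = _mm512_max_epi64(v, t);
  return _mm_cvtsi128_si64(_mm512_castsi512_si128(v));     // maximum now in element 0
}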
// CHECK-LABEL: define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I12_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I13_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I14_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I9_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I10_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I11_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__D_ADDR_I_I:%.*]] = alloca i64, align 8
-// CHECK: [[_COMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__M_ADDR_I:%.*]] = alloca i8, align 1
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__M_ADDR:%.*]] = alloca i8, align 1
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK: store i8 %__M, i8* [[__M_ADDR]], align 1
-// CHECK: store <8 x i64> %__W, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
-// CHECK: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
-// CHECK: store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
-// CHECK: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: store i64 0, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[TMP4:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT_I_I:%.*]] = insertelement <8 x i64> undef, i64 [[TMP4]], i32 0
-// CHECK: [[TMP5:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <8 x i64> [[VECINIT_I_I]], i64 [[TMP5]], i32 1
-// CHECK: [[TMP6:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <8 x i64> [[VECINIT1_I_I]], i64 [[TMP6]], i32 2
-// CHECK: [[TMP7:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <8 x i64> [[VECINIT2_I_I]], i64 [[TMP7]], i32 3
-// CHECK: [[TMP8:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT4_I_I:%.*]] = insertelement <8 x i64> [[VECINIT3_I_I]], i64 [[TMP8]], i32 4
-// CHECK: [[TMP9:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT5_I_I:%.*]] = insertelement <8 x i64> [[VECINIT4_I_I]], i64 [[TMP9]], i32 5
-// CHECK: [[TMP10:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT6_I_I:%.*]] = insertelement <8 x i64> [[VECINIT5_I_I]], i64 [[TMP10]], i32 6
-// CHECK: [[TMP11:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT7_I_I:%.*]] = insertelement <8 x i64> [[VECINIT6_I_I]], i64 [[TMP11]], i32 7
-// CHECK: store <8 x i64> [[VECINIT7_I_I]], <8 x i64>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP13:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-// CHECK: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP3]], <8 x i64> [[TMP12]]
-// CHECK: store <8 x i64> [[TMP14]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP16:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP15]], <8 x i64> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP17:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> [[TMP18]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__A_ADDR_I13_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE1_I]], <8 x i64>* [[__B_ADDR_I14_I]], align 64
-// CHECK: [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I13_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I14_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP22:%.*]] = icmp ugt <8 x i64> [[TMP19]], [[TMP20]]
-// CHECK: [[TMP23:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP19]], <8 x i64> [[TMP20]]
-// CHECK: store <8 x i64> [[TMP23]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP24:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP24]], <8 x i64> [[TMP25]], <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP26:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP27:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE4_I:%.*]] = shufflevector <8 x i64> [[TMP26]], <8 x i64> [[TMP27]], <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__A_ADDR_I10_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE4_I]], <8 x i64>* [[__B_ADDR_I11_I]], align 64
-// CHECK: [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I10_I]], align 64
-// CHECK: [[TMP29:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I11_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I9_I]], align 64
-// CHECK: [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I9_I]], align 64
-// CHECK: [[TMP31:%.*]] = icmp ugt <8 x i64> [[TMP28]], [[TMP29]]
-// CHECK: [[TMP32:%.*]] = select <8 x i1> [[TMP31]], <8 x i64> [[TMP28]], <8 x i64> [[TMP29]]
-// CHECK: store <8 x i64> [[TMP32]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP33:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP34:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <8 x i64> [[TMP33]], <8 x i64> [[TMP34]], <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP35:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP36:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE7_I:%.*]] = shufflevector <8 x i64> [[TMP35]], <8 x i64> [[TMP36]], <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE6_I]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE7_I]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP37:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP38:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP39:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP40:%.*]] = icmp ugt <8 x i64> [[TMP37]], [[TMP38]]
-// CHECK: [[TMP41:%.*]] = select <8 x i1> [[TMP40]], <8 x i64> [[TMP37]], <8 x i64> [[TMP38]]
-// CHECK: store <8 x i64> [[TMP41]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP42:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP42]], i32 0
-// CHECK: ret i64 [[VECEXT_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__A_ADDR_I9_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I10_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I7_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I8_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I6_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__U_ADDR_I_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__M_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__M_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: store i8 [[__M:%.*]], i8* [[__M_ADDR]], align 1
+// CHECK-NEXT: store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
+// CHECK-NEXT: store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store i8 [[TMP2]], i8* [[__U_ADDR_I_I]], align 1
+// CHECK-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[__U_ADDR_I_I]], align 1
+// CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: store <8 x i64> zeroinitializer, <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I_I]], align 64
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I_I]], align 64
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+// CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[TMP6]]
+// CHECK-NEXT: store <8 x i64> [[TMP8]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP9:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__T1_I]], align 64
+// CHECK-NEXT: [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__T1_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP11]], <8 x i64>* [[__A_ADDR_I9_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP12]], <8 x i64>* [[__B_ADDR_I10_I]], align 64
+// CHECK-NEXT: [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I9_I]], align 64
+// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I10_I]], align 64
+// CHECK-NEXT: [[TMP15:%.*]] = icmp ugt <8 x i64> [[TMP13]], [[TMP14]]
+// CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP15]], <8 x i64> [[TMP13]], <8 x i64> [[TMP14]]
+// CHECK-NEXT: store <8 x i64> [[TMP16]], <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP17:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[SHUFFLE2_I:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> [[TMP18]], <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE2_I]], <8 x i64>* [[__T3_I]], align 64
+// CHECK-NEXT: [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__T3_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP19]], <8 x i64>* [[__A_ADDR_I7_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP20]], <8 x i64>* [[__B_ADDR_I8_I]], align 64
+// CHECK-NEXT: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I7_I]], align 64
+// CHECK-NEXT: [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I8_I]], align 64
+// CHECK-NEXT: [[TMP23:%.*]] = icmp ugt <8 x i64> [[TMP21]], [[TMP22]]
+// CHECK-NEXT: [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[TMP22]]
+// CHECK-NEXT: store <8 x i64> [[TMP24]], <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP26:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[SHUFFLE4_I:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE4_I]], <8 x i64>* [[__T5_I]], align 64
+// CHECK-NEXT: [[TMP27:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__T5_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP27]], <8 x i64>* [[__A_ADDR_I6_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP28]], <8 x i64>* [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP29:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I6_I]], align 64
+// CHECK-NEXT: [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP31:%.*]] = icmp ugt <8 x i64> [[TMP29]], [[TMP30]]
+// CHECK-NEXT: [[TMP32:%.*]] = select <8 x i1> [[TMP31]], <8 x i64> [[TMP29]], <8 x i64> [[TMP30]]
+// CHECK-NEXT: store <8 x i64> [[TMP32]], <8 x i64>* [[__T6_I]], align 64
+// CHECK-NEXT: [[TMP33:%.*]] = load <8 x i64>, <8 x i64>* [[__T6_I]], align 64
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP33]], i32 0
+// CHECK-NEXT: ret i64 [[VECEXT_I]]
unsigned long test_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __W){
return _mm512_mask_reduce_max_epu64(__M, __W);
}
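// A minimal C sketch (hedged: reduce_max_epu64_sketch and its body are an
// illustration, not the header's exact code) of the reduction pattern the new
// CHECK lines above encode: masked-off lanes are first replaced with the
// identity (0 for unsigned max), then the vector is folded onto itself in
// log2 steps -- swapping 256-bit halves, then 128-bit quarters, then adjacent
// 64-bit lanes -- taking the element-wise max each time, so lane 0 ends up
// holding the reduction result.
#include <immintrin.h>

static inline unsigned long long
reduce_max_epu64_sketch(__mmask8 __M, __m512i __V) {
  __V = _mm512_maskz_mov_epi64(__M, __V);                 /* identity = 0 */
  __m512i __T1 = _mm512_shuffle_i64x2(__V, __V, 0x4E);    /* 4 5 6 7 0 1 2 3 */
  __m512i __T2 = _mm512_max_epu64(__V, __T1);
  __m512i __T3 = _mm512_shuffle_i64x2(__T2, __T2, 0xB1);  /* 2 3 0 1 6 7 4 5 */
  __m512i __T4 = _mm512_max_epu64(__T2, __T3);
  __m512i __T5 = _mm512_permutex_epi64(__T4, 0xB1);       /* 1 0 3 2 5 4 7 6 */
  __m512i __T6 = _mm512_max_epu64(__T4, __T5);
  /* element 0 of the final vector */
  return (unsigned long long)_mm_cvtsi128_si64(_mm512_castsi512_si128(__T6));
}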
-// CHECK-LABEL: define i64 @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I12_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__A_ADDR_I13_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__B_ADDR_I14_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I9_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__A_ADDR_I10_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__B_ADDR_I11_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__W_ADDR_I_I:%.*]] = alloca double, align 8
-// CHECK: [[_COMPOUNDLITERAL_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__M_ADDR_I:%.*]] = alloca i8, align 1
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__M_ADDR:%.*]] = alloca i8, align 1
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x double>, align 64
-// CHECK: store i8 %__M, i8* [[__M_ADDR]], align 1
-// CHECK: store <8 x double> %__W, <8 x double>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
-// CHECK: [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[__W_ADDR]], align 64
-// CHECK: store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
-// CHECK: store <8 x double> [[TMP1]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
-// CHECK: [[TMP3:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: store double 0x7FF0000000000000, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[TMP4:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[VECINIT_I_I:%.*]] = insertelement <8 x double> undef, double [[TMP4]], i32 0
-// CHECK: [[TMP5:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <8 x double> [[VECINIT_I_I]], double [[TMP5]], i32 1
-// CHECK: [[TMP6:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <8 x double> [[VECINIT1_I_I]], double [[TMP6]], i32 2
-// CHECK: [[TMP7:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <8 x double> [[VECINIT2_I_I]], double [[TMP7]], i32 3
-// CHECK: [[TMP8:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[VECINIT4_I_I:%.*]] = insertelement <8 x double> [[VECINIT3_I_I]], double [[TMP8]], i32 4
-// CHECK: [[TMP9:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[VECINIT5_I_I:%.*]] = insertelement <8 x double> [[VECINIT4_I_I]], double [[TMP9]], i32 5
-// CHECK: [[TMP10:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[VECINIT6_I_I:%.*]] = insertelement <8 x double> [[VECINIT5_I_I]], double [[TMP10]], i32 6
-// CHECK: [[TMP11:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[VECINIT7_I_I:%.*]] = insertelement <8 x double> [[VECINIT6_I_I]], double [[TMP11]], i32 7
-// CHECK: store <8 x double> [[VECINIT7_I_I]], <8 x double>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP12:%.*]] = load <8 x double>, <8 x double>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[SUB_I:%.*]] = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, [[TMP12]]
-// CHECK: [[TMP13:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-// CHECK: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP3]], <8 x double> [[SUB_I]]
-// CHECK: store <8 x double> [[TMP14]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP16:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP17:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP18:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x double> [[SHUFFLE_I]], <8 x double>* [[__A_ADDR_I13_I]], align 64
-// CHECK: store <8 x double> [[SHUFFLE1_I]], <8 x double>* [[__B_ADDR_I14_I]], align 64
-// CHECK: [[TMP19:%.*]] = load <8 x double>, <8 x double>* [[__A_ADDR_I13_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <8 x double>, <8 x double>* [[__B_ADDR_I14_I]], align 64
-// CHECK: store <8 x double> zeroinitializer, <8 x double>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP21:%.*]] = load <8 x double>, <8 x double>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP22:%.*]] = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> [[TMP19]], <8 x double> [[TMP20]], <8 x double> [[TMP21]], i8 -1, i32 4) #2
-// CHECK: store <8 x double> [[TMP22]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP23:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP24:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <8 x double> [[TMP23]], <8 x double> [[TMP24]], <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP25:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP26:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE4_I:%.*]] = shufflevector <8 x double> [[TMP25]], <8 x double> [[TMP26]], <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x double> [[SHUFFLE3_I]], <8 x double>* [[__A_ADDR_I10_I]], align 64
-// CHECK: store <8 x double> [[SHUFFLE4_I]], <8 x double>* [[__B_ADDR_I11_I]], align 64
-// CHECK: [[TMP27:%.*]] = load <8 x double>, <8 x double>* [[__A_ADDR_I10_I]], align 64
-// CHECK: [[TMP28:%.*]] = load <8 x double>, <8 x double>* [[__B_ADDR_I11_I]], align 64
-// CHECK: store <8 x double> zeroinitializer, <8 x double>* [[_COMPOUNDLITERAL_I_I9_I]], align 64
-// CHECK: [[TMP29:%.*]] = load <8 x double>, <8 x double>* [[_COMPOUNDLITERAL_I_I9_I]], align 64
-// CHECK: [[TMP30:%.*]] = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> [[TMP27]], <8 x double> [[TMP28]], <8 x double> [[TMP29]], i8 -1, i32 4) #2
-// CHECK: store <8 x double> [[TMP30]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP31:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP32:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <8 x double> [[TMP31]], <8 x double> [[TMP32]], <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP33:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP34:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE7_I:%.*]] = shufflevector <8 x double> [[TMP33]], <8 x double> [[TMP34]], <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x double> [[SHUFFLE6_I]], <8 x double>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x double> [[SHUFFLE7_I]], <8 x double>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP35:%.*]] = load <8 x double>, <8 x double>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP36:%.*]] = load <8 x double>, <8 x double>* [[__B_ADDR_I_I]], align 64
-// CHECK: store <8 x double> zeroinitializer, <8 x double>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP37:%.*]] = load <8 x double>, <8 x double>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP38:%.*]] = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> [[TMP35]], <8 x double> [[TMP36]], <8 x double> [[TMP37]], i8 -1, i32 4) #2
-// CHECK: store <8 x double> [[TMP38]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP39:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x double> [[TMP39]], i32 0
-// CHECK: [[CONV:%.*]] = fptosi double [[VECEXT_I]] to i64
-// CHECK: ret i64 [[CONV]]
-long long test_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __W){
+// CHECK-LABEL: define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) #0 {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__W2_ADDR_I_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT: [[__U_ADDR_I_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT: [[__A_ADDR_I12_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__B_ADDR_I13_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__A_ADDR_I10_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__B_ADDR_I11_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__A2_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__W_ADDR_I_I:%.*]] = alloca double, align 8
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT: [[__M_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__T7_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__T8_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__M_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT: store i8 [[__M:%.*]], i8* [[__M_ADDR]], align 1
+// CHECK-NEXT: store <8 x double> [[__W:%.*]], <8 x double>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
+// CHECK-NEXT: store <8 x double> [[TMP1]], <8 x double>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store double 0xFFF0000000000000, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <8 x double> [[VECINIT_I_I]], double [[TMP3]], i32 1
+// CHECK-NEXT: [[TMP4:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <8 x double> [[VECINIT1_I_I]], double [[TMP4]], i32 2
+// CHECK-NEXT: [[TMP5:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <8 x double> [[VECINIT2_I_I]], double [[TMP5]], i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT4_I_I:%.*]] = insertelement <8 x double> [[VECINIT3_I_I]], double [[TMP6]], i32 4
+// CHECK-NEXT: [[TMP7:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT5_I_I:%.*]] = insertelement <8 x double> [[VECINIT4_I_I]], double [[TMP7]], i32 5
+// CHECK-NEXT: [[TMP8:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT6_I_I:%.*]] = insertelement <8 x double> [[VECINIT5_I_I]], double [[TMP8]], i32 6
+// CHECK-NEXT: [[TMP9:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT7_I_I:%.*]] = insertelement <8 x double> [[VECINIT6_I_I]], double [[TMP9]], i32 7
+// CHECK-NEXT: store <8 x double> [[VECINIT7_I_I]], <8 x double>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP10:%.*]] = load <8 x double>, <8 x double>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
+// CHECK-NEXT: [[TMP12:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store <8 x double> [[TMP10]], <8 x double>* [[__W2_ADDR_I_I]], align 64
+// CHECK-NEXT: store i8 [[TMP11]], i8* [[__U_ADDR_I_I]], align 1
+// CHECK-NEXT: store <8 x double> [[TMP12]], <8 x double>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[__U_ADDR_I_I]], align 1
+// CHECK-NEXT: [[TMP14:%.*]] = load <8 x double>, <8 x double>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP15:%.*]] = load <8 x double>, <8 x double>* [[__W2_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+// CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP16]], <8 x double> [[TMP14]], <8 x double> [[TMP15]]
+// CHECK-NEXT: store <8 x double> [[TMP17]], <8 x double>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP18:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT_I:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <4 x double> [[EXTRACT_I]], <4 x double>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP19:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT4_I:%.*]] = shufflevector <8 x double> [[TMP19]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: store <4 x double> [[EXTRACT4_I]], <4 x double>* [[__T2_I]], align 32
+// CHECK-NEXT: [[TMP20:%.*]] = load <4 x double>, <4 x double>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP21:%.*]] = load <4 x double>, <4 x double>* [[__T2_I]], align 32
+// CHECK-NEXT: store <4 x double> [[TMP20]], <4 x double>* [[__A_ADDR_I12_I]], align 32
+// CHECK-NEXT: store <4 x double> [[TMP21]], <4 x double>* [[__B_ADDR_I13_I]], align 32
+// CHECK-NEXT: [[TMP22:%.*]] = load <4 x double>, <4 x double>* [[__A_ADDR_I12_I]], align 32
+// CHECK-NEXT: [[TMP23:%.*]] = load <4 x double>, <4 x double>* [[__B_ADDR_I13_I]], align 32
+// CHECK-NEXT: [[TMP24:%.*]] = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> [[TMP22]], <4 x double> [[TMP23]]) #2
+// CHECK-NEXT: store <4 x double> [[TMP24]], <4 x double>* [[__T3_I]], align 32
+// CHECK-NEXT: [[TMP25:%.*]] = load <4 x double>, <4 x double>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT6_I:%.*]] = shufflevector <4 x double> [[TMP25]], <4 x double> undef, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: store <2 x double> [[EXTRACT6_I]], <2 x double>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP26:%.*]] = load <4 x double>, <4 x double>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT7_I:%.*]] = shufflevector <4 x double> [[TMP26]], <4 x double> undef, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: store <2 x double> [[EXTRACT7_I]], <2 x double>* [[__T5_I]], align 16
+// CHECK-NEXT: [[TMP27:%.*]] = load <2 x double>, <2 x double>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP28:%.*]] = load <2 x double>, <2 x double>* [[__T5_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP27]], <2 x double>* [[__A_ADDR_I10_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP28]], <2 x double>* [[__B_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP29:%.*]] = load <2 x double>, <2 x double>* [[__A_ADDR_I10_I]], align 16
+// CHECK-NEXT: [[TMP30:%.*]] = load <2 x double>, <2 x double>* [[__B_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP31:%.*]] = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> [[TMP29]], <2 x double> [[TMP30]]) #2
+// CHECK-NEXT: store <2 x double> [[TMP31]], <2 x double>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP32:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP33:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP32]], <2 x double> [[TMP33]], <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: store <2 x double> [[SHUFFLE_I]], <2 x double>* [[__T7_I]], align 16
+// CHECK-NEXT: [[TMP34:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP35:%.*]] = load <2 x double>, <2 x double>* [[__T7_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP34]], <2 x double>* [[__A2_ADDR_I_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP35]], <2 x double>* [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP36:%.*]] = load <2 x double>, <2 x double>* [[__A2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP37:%.*]] = load <2 x double>, <2 x double>* [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP38:%.*]] = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> [[TMP36]], <2 x double> [[TMP37]]) #2
+// CHECK-NEXT: store <2 x double> [[TMP38]], <2 x double>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP39:%.*]] = load <2 x double>, <2 x double>* [[__T8_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP39]], i32 0
+// CHECK-NEXT: ret double [[VECEXT_I]]
+double test_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __W){
return _mm512_mask_reduce_max_pd(__M, __W);
}
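// A minimal C sketch (hedged: reduce_max_pd_sketch is an illustrative name,
// not the header's exact code) of the pattern in the new CHECK lines for the
// double case: masked-off lanes get the identity -infinity
// (0xFFF0000000000000), then the vector is narrowed 512 -> 256 -> 128 bits
// with a max at each step (llvm.x86.avx.max.pd.256 and llvm.x86.sse2.max.pd
// in the IR above), and the last two lanes are folded with a shuffle + max.
#include <immintrin.h>

static inline double reduce_max_pd_sketch(__mmask8 __M, __m512d __V) {
  __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
  __m256d __T1 = _mm512_extractf64x4_pd(__V, 0);   /* lanes 0..3 */
  __m256d __T2 = _mm512_extractf64x4_pd(__V, 1);   /* lanes 4..7 */
  __m256d __T3 = _mm256_max_pd(__T1, __T2);
  __m128d __T4 = _mm256_extractf128_pd(__T3, 0);   /* lanes 0..1 */
  __m128d __T5 = _mm256_extractf128_pd(__T3, 1);   /* lanes 2..3 */
  __m128d __T6 = _mm_max_pd(__T4, __T5);
  __m128d __T7 = _mm_shuffle_pd(__T6, __T6, 1);    /* swap the two lanes */
  __m128d __T8 = _mm_max_pd(__T6, __T7);
  return _mm_cvtsd_f64(__T8);                      /* element 0 */
}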
// CHECK-LABEL: define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I12_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I13_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I14_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I9_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I10_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I11_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__D_ADDR_I_I:%.*]] = alloca i64, align 8
-// CHECK: [[_COMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__M_ADDR_I:%.*]] = alloca i8, align 1
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__M_ADDR:%.*]] = alloca i8, align 1
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK: store i8 %__M, i8* [[__M_ADDR]], align 1
-// CHECK: store <8 x i64> %__W, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
-// CHECK: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
-// CHECK: store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
-// CHECK: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: store i64 9223372036854775807, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[TMP4:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT_I_I:%.*]] = insertelement <8 x i64> undef, i64 [[TMP4]], i32 0
-// CHECK: [[TMP5:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <8 x i64> [[VECINIT_I_I]], i64 [[TMP5]], i32 1
-// CHECK: [[TMP6:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <8 x i64> [[VECINIT1_I_I]], i64 [[TMP6]], i32 2
-// CHECK: [[TMP7:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <8 x i64> [[VECINIT2_I_I]], i64 [[TMP7]], i32 3
-// CHECK: [[TMP8:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT4_I_I:%.*]] = insertelement <8 x i64> [[VECINIT3_I_I]], i64 [[TMP8]], i32 4
-// CHECK: [[TMP9:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT5_I_I:%.*]] = insertelement <8 x i64> [[VECINIT4_I_I]], i64 [[TMP9]], i32 5
-// CHECK: [[TMP10:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT6_I_I:%.*]] = insertelement <8 x i64> [[VECINIT5_I_I]], i64 [[TMP10]], i32 6
-// CHECK: [[TMP11:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT7_I_I:%.*]] = insertelement <8 x i64> [[VECINIT6_I_I]], i64 [[TMP11]], i32 7
-// CHECK: store <8 x i64> [[VECINIT7_I_I]], <8 x i64>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP13:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-// CHECK: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP3]], <8 x i64> [[TMP12]]
-// CHECK: store <8 x i64> [[TMP14]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP16:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP15]], <8 x i64> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP17:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> [[TMP18]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__A_ADDR_I13_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE1_I]], <8 x i64>* [[__B_ADDR_I14_I]], align 64
-// CHECK: [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I13_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I14_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP22:%.*]] = icmp slt <8 x i64> [[TMP19]], [[TMP20]]
-// CHECK: [[TMP23:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP19]], <8 x i64> [[TMP20]]
-// CHECK: store <8 x i64> [[TMP23]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP24:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP24]], <8 x i64> [[TMP25]], <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP26:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP27:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE4_I:%.*]] = shufflevector <8 x i64> [[TMP26]], <8 x i64> [[TMP27]], <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__A_ADDR_I10_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE4_I]], <8 x i64>* [[__B_ADDR_I11_I]], align 64
-// CHECK: [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I10_I]], align 64
-// CHECK: [[TMP29:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I11_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I9_I]], align 64
-// CHECK: [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I9_I]], align 64
-// CHECK: [[TMP31:%.*]] = icmp slt <8 x i64> [[TMP28]], [[TMP29]]
-// CHECK: [[TMP32:%.*]] = select <8 x i1> [[TMP31]], <8 x i64> [[TMP28]], <8 x i64> [[TMP29]]
-// CHECK: store <8 x i64> [[TMP32]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP33:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP34:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <8 x i64> [[TMP33]], <8 x i64> [[TMP34]], <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP35:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP36:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE7_I:%.*]] = shufflevector <8 x i64> [[TMP35]], <8 x i64> [[TMP36]], <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE6_I]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE7_I]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP37:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP38:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP39:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP40:%.*]] = icmp slt <8 x i64> [[TMP37]], [[TMP38]]
-// CHECK: [[TMP41:%.*]] = select <8 x i1> [[TMP40]], <8 x i64> [[TMP37]], <8 x i64> [[TMP38]]
-// CHECK: store <8 x i64> [[TMP41]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP42:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP42]], i32 0
-// CHECK: ret i64 [[VECEXT_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__W_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__U_ADDR_I_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__A_ADDR_I11_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I9_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I10_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I7_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I8_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__D_ADDR_I_I:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__M_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__M_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: store i8 [[__M:%.*]], i8* [[__M_ADDR]], align 1
+// CHECK-NEXT: store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
+// CHECK-NEXT: store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store i64 9223372036854775807, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <8 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <8 x i64> [[VECINIT_I_I]], i64 [[TMP3]], i32 1
+// CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <8 x i64> [[VECINIT1_I_I]], i64 [[TMP4]], i32 2
+// CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <8 x i64> [[VECINIT2_I_I]], i64 [[TMP5]], i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT4_I_I:%.*]] = insertelement <8 x i64> [[VECINIT3_I_I]], i64 [[TMP6]], i32 4
+// CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT5_I_I:%.*]] = insertelement <8 x i64> [[VECINIT4_I_I]], i64 [[TMP7]], i32 5
+// CHECK-NEXT: [[TMP8:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT6_I_I:%.*]] = insertelement <8 x i64> [[VECINIT5_I_I]], i64 [[TMP8]], i32 6
+// CHECK-NEXT: [[TMP9:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT7_I_I:%.*]] = insertelement <8 x i64> [[VECINIT6_I_I]], i64 [[TMP9]], i32 7
+// CHECK-NEXT: store <8 x i64> [[VECINIT7_I_I]], <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
+// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP10]], <8 x i64>* [[__W_ADDR_I_I]], align 64
+// CHECK-NEXT: store i8 [[TMP11]], i8* [[__U_ADDR_I_I]], align 1
+// CHECK-NEXT: store <8 x i64> [[TMP12]], <8 x i64>* [[__A_ADDR_I11_I]], align 64
+// CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[__U_ADDR_I_I]], align 1
+// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I11_I]], align 64
+// CHECK-NEXT: [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+// CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP14]], <8 x i64> [[TMP15]]
+// CHECK-NEXT: store <8 x i64> [[TMP17]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP18]], <8 x i64> [[TMP19]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__T1_I]], align 64
+// CHECK-NEXT: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__T1_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP20]], <8 x i64>* [[__A_ADDR_I9_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP21]], <8 x i64>* [[__B_ADDR_I10_I]], align 64
+// CHECK-NEXT: [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I9_I]], align 64
+// CHECK-NEXT: [[TMP23:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I10_I]], align 64
+// CHECK-NEXT: [[TMP24:%.*]] = icmp slt <8 x i64> [[TMP22]], [[TMP23]]
+// CHECK-NEXT: [[TMP25:%.*]] = select <8 x i1> [[TMP24]], <8 x i64> [[TMP22]], <8 x i64> [[TMP23]]
+// CHECK-NEXT: store <8 x i64> [[TMP25]], <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP26:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP27:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP26]], <8 x i64> [[TMP27]], <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__T3_I]], align 64
+// CHECK-NEXT: [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP29:%.*]] = load <8 x i64>, <8 x i64>* [[__T3_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP28]], <8 x i64>* [[__A_ADDR_I7_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP29]], <8 x i64>* [[__B_ADDR_I8_I]], align 64
+// CHECK-NEXT: [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I7_I]], align 64
+// CHECK-NEXT: [[TMP31:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I8_I]], align 64
+// CHECK-NEXT: [[TMP32:%.*]] = icmp slt <8 x i64> [[TMP30]], [[TMP31]]
+// CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP32]], <8 x i64> [[TMP30]], <8 x i64> [[TMP31]]
+// CHECK-NEXT: store <8 x i64> [[TMP33]], <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP34:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP35:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[SHUFFLE5_I:%.*]] = shufflevector <8 x i64> [[TMP34]], <8 x i64> [[TMP35]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE5_I]], <8 x i64>* [[__T5_I]], align 64
+// CHECK-NEXT: [[TMP36:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP37:%.*]] = load <8 x i64>, <8 x i64>* [[__T5_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP36]], <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP37]], <8 x i64>* [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP38:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP39:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP40:%.*]] = icmp slt <8 x i64> [[TMP38]], [[TMP39]]
+// CHECK-NEXT: [[TMP41:%.*]] = select <8 x i1> [[TMP40]], <8 x i64> [[TMP38]], <8 x i64> [[TMP39]]
+// CHECK-NEXT: store <8 x i64> [[TMP41]], <8 x i64>* [[__T6_I]], align 64
+// CHECK-NEXT: [[TMP42:%.*]] = load <8 x i64>, <8 x i64>* [[__T6_I]], align 64
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP42]], i32 0
+// CHECK-NEXT: ret i64 [[VECEXT_I]]
long long test_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __W){
return _mm512_mask_reduce_min_epi64(__M, __W);
}
// CHECK-LABEL: define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I12_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I13_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I14_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I9_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I10_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I11_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__D_ADDR_I_I:%.*]] = alloca i64, align 8
-// CHECK: [[_COMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__M_ADDR_I:%.*]] = alloca i8, align 1
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__M_ADDR:%.*]] = alloca i8, align 1
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK: store i8 %__M, i8* [[__M_ADDR]], align 1
-// CHECK: store <8 x i64> %__W, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
-// CHECK: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
-// CHECK: store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
-// CHECK: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: store i64 0, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[TMP4:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT_I_I:%.*]] = insertelement <8 x i64> undef, i64 [[TMP4]], i32 0
-// CHECK: [[TMP5:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <8 x i64> [[VECINIT_I_I]], i64 [[TMP5]], i32 1
-// CHECK: [[TMP6:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <8 x i64> [[VECINIT1_I_I]], i64 [[TMP6]], i32 2
-// CHECK: [[TMP7:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <8 x i64> [[VECINIT2_I_I]], i64 [[TMP7]], i32 3
-// CHECK: [[TMP8:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT4_I_I:%.*]] = insertelement <8 x i64> [[VECINIT3_I_I]], i64 [[TMP8]], i32 4
-// CHECK: [[TMP9:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT5_I_I:%.*]] = insertelement <8 x i64> [[VECINIT4_I_I]], i64 [[TMP9]], i32 5
-// CHECK: [[TMP10:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT6_I_I:%.*]] = insertelement <8 x i64> [[VECINIT5_I_I]], i64 [[TMP10]], i32 6
-// CHECK: [[TMP11:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
-// CHECK: [[VECINIT7_I_I:%.*]] = insertelement <8 x i64> [[VECINIT6_I_I]], i64 [[TMP11]], i32 7
-// CHECK: store <8 x i64> [[VECINIT7_I_I]], <8 x i64>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP13:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-// CHECK: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP3]], <8 x i64> [[TMP12]]
-// CHECK: store <8 x i64> [[TMP14]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP16:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP15]], <8 x i64> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP17:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> [[TMP18]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__A_ADDR_I13_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE1_I]], <8 x i64>* [[__B_ADDR_I14_I]], align 64
-// CHECK: [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I13_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I14_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP22:%.*]] = icmp ugt <8 x i64> [[TMP19]], [[TMP20]]
-// CHECK: [[TMP23:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP19]], <8 x i64> [[TMP20]]
-// CHECK: store <8 x i64> [[TMP23]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP24:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP24]], <8 x i64> [[TMP25]], <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP26:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP27:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE4_I:%.*]] = shufflevector <8 x i64> [[TMP26]], <8 x i64> [[TMP27]], <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__A_ADDR_I10_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE4_I]], <8 x i64>* [[__B_ADDR_I11_I]], align 64
-// CHECK: [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I10_I]], align 64
-// CHECK: [[TMP29:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I11_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I9_I]], align 64
-// CHECK: [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I9_I]], align 64
-// CHECK: [[TMP31:%.*]] = icmp ugt <8 x i64> [[TMP28]], [[TMP29]]
-// CHECK: [[TMP32:%.*]] = select <8 x i1> [[TMP31]], <8 x i64> [[TMP28]], <8 x i64> [[TMP29]]
-// CHECK: store <8 x i64> [[TMP32]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP33:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP34:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <8 x i64> [[TMP33]], <8 x i64> [[TMP34]], <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP35:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP36:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE7_I:%.*]] = shufflevector <8 x i64> [[TMP35]], <8 x i64> [[TMP36]], <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x i64> [[SHUFFLE6_I]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> [[SHUFFLE7_I]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP37:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP38:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP39:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP40:%.*]] = icmp ugt <8 x i64> [[TMP37]], [[TMP38]]
-// CHECK: [[TMP41:%.*]] = select <8 x i1> [[TMP40]], <8 x i64> [[TMP37]], <8 x i64> [[TMP38]]
-// CHECK: store <8 x i64> [[TMP41]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP42:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP42]], i32 0
-// CHECK: ret i64 [[VECEXT_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__W_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__U_ADDR_I_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__A_ADDR_I11_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I9_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I10_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I7_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I8_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__D_ADDR_I_I:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__M_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__M_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: store i8 [[__M:%.*]], i8* [[__M_ADDR]], align 1
+// CHECK-NEXT: store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
+// CHECK-NEXT: store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store i64 -1, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <8 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <8 x i64> [[VECINIT_I_I]], i64 [[TMP3]], i32 1
+// CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <8 x i64> [[VECINIT1_I_I]], i64 [[TMP4]], i32 2
+// CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <8 x i64> [[VECINIT2_I_I]], i64 [[TMP5]], i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT4_I_I:%.*]] = insertelement <8 x i64> [[VECINIT3_I_I]], i64 [[TMP6]], i32 4
+// CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT5_I_I:%.*]] = insertelement <8 x i64> [[VECINIT4_I_I]], i64 [[TMP7]], i32 5
+// CHECK-NEXT: [[TMP8:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT6_I_I:%.*]] = insertelement <8 x i64> [[VECINIT5_I_I]], i64 [[TMP8]], i32 6
+// CHECK-NEXT: [[TMP9:%.*]] = load i64, i64* [[__D_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT7_I_I:%.*]] = insertelement <8 x i64> [[VECINIT6_I_I]], i64 [[TMP9]], i32 7
+// CHECK-NEXT: store <8 x i64> [[VECINIT7_I_I]], <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i64>, <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
+// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP10]], <8 x i64>* [[__W_ADDR_I_I]], align 64
+// CHECK-NEXT: store i8 [[TMP11]], i8* [[__U_ADDR_I_I]], align 1
+// CHECK-NEXT: store <8 x i64> [[TMP12]], <8 x i64>* [[__A_ADDR_I11_I]], align 64
+// CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[__U_ADDR_I_I]], align 1
+// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I11_I]], align 64
+// CHECK-NEXT: [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+// CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP14]], <8 x i64> [[TMP15]]
+// CHECK-NEXT: store <8 x i64> [[TMP17]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP18:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP19:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i64> [[TMP18]], <8 x i64> [[TMP19]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE_I]], <8 x i64>* [[__T1_I]], align 64
+// CHECK-NEXT: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__T1_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP20]], <8 x i64>* [[__A_ADDR_I9_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP21]], <8 x i64>* [[__B_ADDR_I10_I]], align 64
+// CHECK-NEXT: [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I9_I]], align 64
+// CHECK-NEXT: [[TMP23:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I10_I]], align 64
+// CHECK-NEXT: [[TMP24:%.*]] = icmp ult <8 x i64> [[TMP22]], [[TMP23]]
+// CHECK-NEXT: [[TMP25:%.*]] = select <8 x i1> [[TMP24]], <8 x i64> [[TMP22]], <8 x i64> [[TMP23]]
+// CHECK-NEXT: store <8 x i64> [[TMP25]], <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP26:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP27:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[SHUFFLE3_I:%.*]] = shufflevector <8 x i64> [[TMP26]], <8 x i64> [[TMP27]], <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE3_I]], <8 x i64>* [[__T3_I]], align 64
+// CHECK-NEXT: [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__T2_I]], align 64
+// CHECK-NEXT: [[TMP29:%.*]] = load <8 x i64>, <8 x i64>* [[__T3_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP28]], <8 x i64>* [[__A_ADDR_I7_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP29]], <8 x i64>* [[__B_ADDR_I8_I]], align 64
+// CHECK-NEXT: [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I7_I]], align 64
+// CHECK-NEXT: [[TMP31:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I8_I]], align 64
+// CHECK-NEXT: [[TMP32:%.*]] = icmp ult <8 x i64> [[TMP30]], [[TMP31]]
+// CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP32]], <8 x i64> [[TMP30]], <8 x i64> [[TMP31]]
+// CHECK-NEXT: store <8 x i64> [[TMP33]], <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP34:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP35:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[SHUFFLE5_I:%.*]] = shufflevector <8 x i64> [[TMP34]], <8 x i64> [[TMP35]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: store <8 x i64> [[SHUFFLE5_I]], <8 x i64>* [[__T5_I]], align 64
+// CHECK-NEXT: [[TMP36:%.*]] = load <8 x i64>, <8 x i64>* [[__T4_I]], align 64
+// CHECK-NEXT: [[TMP37:%.*]] = load <8 x i64>, <8 x i64>* [[__T5_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP36]], <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP37]], <8 x i64>* [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP38:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP39:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP40:%.*]] = icmp ult <8 x i64> [[TMP38]], [[TMP39]]
+// CHECK-NEXT: [[TMP41:%.*]] = select <8 x i1> [[TMP40]], <8 x i64> [[TMP38]], <8 x i64> [[TMP39]]
+// CHECK-NEXT: store <8 x i64> [[TMP41]], <8 x i64>* [[__T6_I]], align 64
+// CHECK-NEXT: [[TMP42:%.*]] = load <8 x i64>, <8 x i64>* [[__T6_I]], align 64
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP42]], i32 0
+// CHECK-NEXT: ret i64 [[VECEXT_I]]
long long test_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __W){
- return _mm512_mask_reduce_max_epu64(__M, __W);
+ return _mm512_mask_reduce_min_epu64(__M, __W);
}
// CHECK-LABEL: define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I12_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__A_ADDR_I13_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__B_ADDR_I14_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I9_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__A_ADDR_I10_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__B_ADDR_I11_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__W_ADDR_I_I:%.*]] = alloca double, align 8
-// CHECK: [[_COMPOUNDLITERAL_I_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__M_ADDR_I:%.*]] = alloca i8, align 1
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <8 x double>, align 64
-// CHECK: [[__M_ADDR:%.*]] = alloca i8, align 1
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x double>, align 64
-// CHECK: store i8 %__M, i8* [[__M_ADDR]], align 1
-// CHECK: store <8 x double> %__W, <8 x double>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
-// CHECK: [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[__W_ADDR]], align 64
-// CHECK: store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
-// CHECK: store <8 x double> [[TMP1]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
-// CHECK: [[TMP3:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: store double 0x7FF0000000000000, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[TMP4:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[VECINIT_I_I:%.*]] = insertelement <8 x double> undef, double [[TMP4]], i32 0
-// CHECK: [[TMP5:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <8 x double> [[VECINIT_I_I]], double [[TMP5]], i32 1
-// CHECK: [[TMP6:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <8 x double> [[VECINIT1_I_I]], double [[TMP6]], i32 2
-// CHECK: [[TMP7:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <8 x double> [[VECINIT2_I_I]], double [[TMP7]], i32 3
-// CHECK: [[TMP8:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[VECINIT4_I_I:%.*]] = insertelement <8 x double> [[VECINIT3_I_I]], double [[TMP8]], i32 4
-// CHECK: [[TMP9:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[VECINIT5_I_I:%.*]] = insertelement <8 x double> [[VECINIT4_I_I]], double [[TMP9]], i32 5
-// CHECK: [[TMP10:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[VECINIT6_I_I:%.*]] = insertelement <8 x double> [[VECINIT5_I_I]], double [[TMP10]], i32 6
-// CHECK: [[TMP11:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
-// CHECK: [[VECINIT7_I_I:%.*]] = insertelement <8 x double> [[VECINIT6_I_I]], double [[TMP11]], i32 7
-// CHECK: store <8 x double> [[VECINIT7_I_I]], <8 x double>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP12:%.*]] = load <8 x double>, <8 x double>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP13:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-// CHECK: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP3]], <8 x double> [[TMP12]]
-// CHECK: store <8 x double> [[TMP14]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP16:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP17:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP18:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x double> [[SHUFFLE_I]], <8 x double>* [[__A_ADDR_I13_I]], align 64
-// CHECK: store <8 x double> [[SHUFFLE1_I]], <8 x double>* [[__B_ADDR_I14_I]], align 64
-// CHECK: [[TMP19:%.*]] = load <8 x double>, <8 x double>* [[__A_ADDR_I13_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <8 x double>, <8 x double>* [[__B_ADDR_I14_I]], align 64
-// CHECK: store <8 x double> zeroinitializer, <8 x double>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP21:%.*]] = load <8 x double>, <8 x double>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP22:%.*]] = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> [[TMP19]], <8 x double> [[TMP20]], <8 x double> [[TMP21]], i8 -1, i32 4) #2
-// CHECK: store <8 x double> [[TMP22]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP23:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP24:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <8 x double> [[TMP23]], <8 x double> [[TMP24]], <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP25:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP26:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE4_I:%.*]] = shufflevector <8 x double> [[TMP25]], <8 x double> [[TMP26]], <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x double> [[SHUFFLE3_I]], <8 x double>* [[__A_ADDR_I10_I]], align 64
-// CHECK: store <8 x double> [[SHUFFLE4_I]], <8 x double>* [[__B_ADDR_I11_I]], align 64
-// CHECK: [[TMP27:%.*]] = load <8 x double>, <8 x double>* [[__A_ADDR_I10_I]], align 64
-// CHECK: [[TMP28:%.*]] = load <8 x double>, <8 x double>* [[__B_ADDR_I11_I]], align 64
-// CHECK: store <8 x double> zeroinitializer, <8 x double>* [[_COMPOUNDLITERAL_I_I9_I]], align 64
-// CHECK: [[TMP29:%.*]] = load <8 x double>, <8 x double>* [[_COMPOUNDLITERAL_I_I9_I]], align 64
-// CHECK: [[TMP30:%.*]] = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> [[TMP27]], <8 x double> [[TMP28]], <8 x double> [[TMP29]], i8 -1, i32 4) #2
-// CHECK: store <8 x double> [[TMP30]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP31:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP32:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <8 x double> [[TMP31]], <8 x double> [[TMP32]], <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP33:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP34:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE7_I:%.*]] = shufflevector <8 x double> [[TMP33]], <8 x double> [[TMP34]], <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <8 x double> [[SHUFFLE6_I]], <8 x double>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x double> [[SHUFFLE7_I]], <8 x double>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP35:%.*]] = load <8 x double>, <8 x double>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP36:%.*]] = load <8 x double>, <8 x double>* [[__B_ADDR_I_I]], align 64
-// CHECK: store <8 x double> zeroinitializer, <8 x double>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP37:%.*]] = load <8 x double>, <8 x double>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP38:%.*]] = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> [[TMP35]], <8 x double> [[TMP36]], <8 x double> [[TMP37]], i8 -1, i32 4) #2
-// CHECK: store <8 x double> [[TMP38]], <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP39:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x double> [[TMP39]], i32 0
-// CHECK: ret double [[VECEXT_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__W2_ADDR_I_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT: [[__U_ADDR_I_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT: [[__A_ADDR_I12_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__B_ADDR_I13_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__A_ADDR_I10_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__B_ADDR_I11_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__A2_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__W_ADDR_I_I:%.*]] = alloca double, align 8
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT: [[__M_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__T7_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__T8_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__M_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT: store i8 [[__M:%.*]], i8* [[__M_ADDR]], align 1
+// CHECK-NEXT: store <8 x double> [[__W:%.*]], <8 x double>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[__M_ADDR]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store i8 [[TMP0]], i8* [[__M_ADDR_I]], align 1
+// CHECK-NEXT: store <8 x double> [[TMP1]], <8 x double>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store double 0x7FF0000000000000, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <8 x double> [[VECINIT_I_I]], double [[TMP3]], i32 1
+// CHECK-NEXT: [[TMP4:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <8 x double> [[VECINIT1_I_I]], double [[TMP4]], i32 2
+// CHECK-NEXT: [[TMP5:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <8 x double> [[VECINIT2_I_I]], double [[TMP5]], i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT4_I_I:%.*]] = insertelement <8 x double> [[VECINIT3_I_I]], double [[TMP6]], i32 4
+// CHECK-NEXT: [[TMP7:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT5_I_I:%.*]] = insertelement <8 x double> [[VECINIT4_I_I]], double [[TMP7]], i32 5
+// CHECK-NEXT: [[TMP8:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT6_I_I:%.*]] = insertelement <8 x double> [[VECINIT5_I_I]], double [[TMP8]], i32 6
+// CHECK-NEXT: [[TMP9:%.*]] = load double, double* [[__W_ADDR_I_I]], align 8
+// CHECK-NEXT: [[VECINIT7_I_I:%.*]] = insertelement <8 x double> [[VECINIT6_I_I]], double [[TMP9]], i32 7
+// CHECK-NEXT: store <8 x double> [[VECINIT7_I_I]], <8 x double>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP10:%.*]] = load <8 x double>, <8 x double>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[__M_ADDR_I]], align 1
+// CHECK-NEXT: [[TMP12:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store <8 x double> [[TMP10]], <8 x double>* [[__W2_ADDR_I_I]], align 64
+// CHECK-NEXT: store i8 [[TMP11]], i8* [[__U_ADDR_I_I]], align 1
+// CHECK-NEXT: store <8 x double> [[TMP12]], <8 x double>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[__U_ADDR_I_I]], align 1
+// CHECK-NEXT: [[TMP14:%.*]] = load <8 x double>, <8 x double>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP15:%.*]] = load <8 x double>, <8 x double>* [[__W2_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+// CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP16]], <8 x double> [[TMP14]], <8 x double> [[TMP15]]
+// CHECK-NEXT: store <8 x double> [[TMP17]], <8 x double>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP18:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT_I:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <4 x double> [[EXTRACT_I]], <4 x double>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP19:%.*]] = load <8 x double>, <8 x double>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT4_I:%.*]] = shufflevector <8 x double> [[TMP19]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: store <4 x double> [[EXTRACT4_I]], <4 x double>* [[__T2_I]], align 32
+// CHECK-NEXT: [[TMP20:%.*]] = load <4 x double>, <4 x double>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP21:%.*]] = load <4 x double>, <4 x double>* [[__T2_I]], align 32
+// CHECK-NEXT: store <4 x double> [[TMP20]], <4 x double>* [[__A_ADDR_I12_I]], align 32
+// CHECK-NEXT: store <4 x double> [[TMP21]], <4 x double>* [[__B_ADDR_I13_I]], align 32
+// CHECK-NEXT: [[TMP22:%.*]] = load <4 x double>, <4 x double>* [[__A_ADDR_I12_I]], align 32
+// CHECK-NEXT: [[TMP23:%.*]] = load <4 x double>, <4 x double>* [[__B_ADDR_I13_I]], align 32
+// CHECK-NEXT: [[TMP24:%.*]] = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> [[TMP22]], <4 x double> [[TMP23]]) #2
+// CHECK-NEXT: store <4 x double> [[TMP24]], <4 x double>* [[__T3_I]], align 32
+// CHECK-NEXT: [[TMP25:%.*]] = load <4 x double>, <4 x double>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT6_I:%.*]] = shufflevector <4 x double> [[TMP25]], <4 x double> undef, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: store <2 x double> [[EXTRACT6_I]], <2 x double>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP26:%.*]] = load <4 x double>, <4 x double>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT7_I:%.*]] = shufflevector <4 x double> [[TMP26]], <4 x double> undef, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: store <2 x double> [[EXTRACT7_I]], <2 x double>* [[__T5_I]], align 16
+// CHECK-NEXT: [[TMP27:%.*]] = load <2 x double>, <2 x double>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP28:%.*]] = load <2 x double>, <2 x double>* [[__T5_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP27]], <2 x double>* [[__A_ADDR_I10_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP28]], <2 x double>* [[__B_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP29:%.*]] = load <2 x double>, <2 x double>* [[__A_ADDR_I10_I]], align 16
+// CHECK-NEXT: [[TMP30:%.*]] = load <2 x double>, <2 x double>* [[__B_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP31:%.*]] = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> [[TMP29]], <2 x double> [[TMP30]]) #2
+// CHECK-NEXT: store <2 x double> [[TMP31]], <2 x double>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP32:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP33:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP32]], <2 x double> [[TMP33]], <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: store <2 x double> [[SHUFFLE_I]], <2 x double>* [[__T7_I]], align 16
+// CHECK-NEXT: [[TMP34:%.*]] = load <2 x double>, <2 x double>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP35:%.*]] = load <2 x double>, <2 x double>* [[__T7_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP34]], <2 x double>* [[__A2_ADDR_I_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP35]], <2 x double>* [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP36:%.*]] = load <2 x double>, <2 x double>* [[__A2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP37:%.*]] = load <2 x double>, <2 x double>* [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP38:%.*]] = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> [[TMP36]], <2 x double> [[TMP37]]) #2
+// CHECK-NEXT: store <2 x double> [[TMP38]], <2 x double>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP39:%.*]] = load <2 x double>, <2 x double>* [[__T8_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP39]], i32 0
+// CHECK-NEXT: ret double [[VECEXT_I]]
double test_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __W){
return _mm512_mask_reduce_min_pd(__M, __W);
}
// CHECK-LABEL: define i32 @test_mm512_reduce_max_epi32(<8 x i64> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I17_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I18_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I19_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I14_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I15_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I16_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I11_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I12_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I13_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK: store <8 x i64> %__W, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: store <8 x i64> [[TMP0]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
-// CHECK: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i32> [[SHUFFLE_I]] to <8 x i64>
-// CHECK: [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP6]] to <16 x i32>
-// CHECK: [[TMP8:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP8]] to <16 x i32>
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP9]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i32> [[SHUFFLE1_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP5]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> [[TMP10]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP11]] to <16 x i32>
-// CHECK: [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i64> [[TMP13]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i64> [[TMP15]] to <16 x i32>
-// CHECK: [[TMP17:%.*]] = icmp sgt <16 x i32> [[TMP12]], [[TMP14]]
-// CHECK: [[TMP18:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP12]], <16 x i32> [[TMP14]]
-// CHECK: [[TMP19:%.*]] = bitcast <16 x i32> [[TMP18]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP19]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP21:%.*]] = bitcast <8 x i64> [[TMP20]] to <16 x i32>
-// CHECK: [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP23:%.*]] = bitcast <8 x i64> [[TMP22]] to <16 x i32>
-// CHECK: [[SHUFFLE2_I:%.*]] = shufflevector <16 x i32> [[TMP21]], <16 x i32> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP24:%.*]] = bitcast <16 x i32> [[SHUFFLE2_I]] to <8 x i64>
-// CHECK: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP26:%.*]] = bitcast <8 x i64> [[TMP25]] to <16 x i32>
-// CHECK: [[TMP27:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP28:%.*]] = bitcast <8 x i64> [[TMP27]] to <16 x i32>
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <16 x i32> [[TMP26]], <16 x i32> [[TMP28]], <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP29:%.*]] = bitcast <16 x i32> [[SHUFFLE3_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP24]], <8 x i64>* [[__A_ADDR_I18_I]], align 64
-// CHECK: store <8 x i64> [[TMP29]], <8 x i64>* [[__B_ADDR_I19_I]], align 64
-// CHECK: [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I18_I]], align 64
-// CHECK: [[TMP31:%.*]] = bitcast <8 x i64> [[TMP30]] to <16 x i32>
-// CHECK: [[TMP32:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I19_I]], align 64
-// CHECK: [[TMP33:%.*]] = bitcast <8 x i64> [[TMP32]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I17_I]], align 64
-// CHECK: [[TMP34:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I17_I]], align 64
-// CHECK: [[TMP35:%.*]] = bitcast <8 x i64> [[TMP34]] to <16 x i32>
-// CHECK: [[TMP36:%.*]] = icmp sgt <16 x i32> [[TMP31]], [[TMP33]]
-// CHECK: [[TMP37:%.*]] = select <16 x i1> [[TMP36]], <16 x i32> [[TMP31]], <16 x i32> [[TMP33]]
-// CHECK: [[TMP38:%.*]] = bitcast <16 x i32> [[TMP37]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP38]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP39:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP40:%.*]] = bitcast <8 x i64> [[TMP39]] to <16 x i32>
-// CHECK: [[TMP41:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP42:%.*]] = bitcast <8 x i64> [[TMP41]] to <16 x i32>
-// CHECK: [[SHUFFLE5_I:%.*]] = shufflevector <16 x i32> [[TMP40]], <16 x i32> [[TMP42]], <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP43:%.*]] = bitcast <16 x i32> [[SHUFFLE5_I]] to <8 x i64>
-// CHECK: [[TMP44:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP45:%.*]] = bitcast <8 x i64> [[TMP44]] to <16 x i32>
-// CHECK: [[TMP46:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP47:%.*]] = bitcast <8 x i64> [[TMP46]] to <16 x i32>
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP47]], <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP48:%.*]] = bitcast <16 x i32> [[SHUFFLE6_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP43]], <8 x i64>* [[__A_ADDR_I15_I]], align 64
-// CHECK: store <8 x i64> [[TMP48]], <8 x i64>* [[__B_ADDR_I16_I]], align 64
-// CHECK: [[TMP49:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I15_I]], align 64
-// CHECK: [[TMP50:%.*]] = bitcast <8 x i64> [[TMP49]] to <16 x i32>
-// CHECK: [[TMP51:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I16_I]], align 64
-// CHECK: [[TMP52:%.*]] = bitcast <8 x i64> [[TMP51]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I14_I]], align 64
-// CHECK: [[TMP53:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I14_I]], align 64
-// CHECK: [[TMP54:%.*]] = bitcast <8 x i64> [[TMP53]] to <16 x i32>
-// CHECK: [[TMP55:%.*]] = icmp sgt <16 x i32> [[TMP50]], [[TMP52]]
-// CHECK: [[TMP56:%.*]] = select <16 x i1> [[TMP55]], <16 x i32> [[TMP50]], <16 x i32> [[TMP52]]
-// CHECK: [[TMP57:%.*]] = bitcast <16 x i32> [[TMP56]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP57]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP58:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP59:%.*]] = bitcast <8 x i64> [[TMP58]] to <16 x i32>
-// CHECK: [[TMP60:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP61:%.*]] = bitcast <8 x i64> [[TMP60]] to <16 x i32>
-// CHECK: [[SHUFFLE8_I:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP62:%.*]] = bitcast <16 x i32> [[SHUFFLE8_I]] to <8 x i64>
-// CHECK: [[TMP63:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP64:%.*]] = bitcast <8 x i64> [[TMP63]] to <16 x i32>
-// CHECK: [[TMP65:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP66:%.*]] = bitcast <8 x i64> [[TMP65]] to <16 x i32>
-// CHECK: [[SHUFFLE9_I:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP66]], <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP67:%.*]] = bitcast <16 x i32> [[SHUFFLE9_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP62]], <8 x i64>* [[__A_ADDR_I12_I]], align 64
-// CHECK: store <8 x i64> [[TMP67]], <8 x i64>* [[__B_ADDR_I13_I]], align 64
-// CHECK: [[TMP68:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I12_I]], align 64
-// CHECK: [[TMP69:%.*]] = bitcast <8 x i64> [[TMP68]] to <16 x i32>
-// CHECK: [[TMP70:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I13_I]], align 64
-// CHECK: [[TMP71:%.*]] = bitcast <8 x i64> [[TMP70]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP72:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP73:%.*]] = bitcast <8 x i64> [[TMP72]] to <16 x i32>
-// CHECK: [[TMP74:%.*]] = icmp sgt <16 x i32> [[TMP69]], [[TMP71]]
-// CHECK: [[TMP75:%.*]] = select <16 x i1> [[TMP74]], <16 x i32> [[TMP69]], <16 x i32> [[TMP71]]
-// CHECK: [[TMP76:%.*]] = bitcast <16 x i32> [[TMP75]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP76]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP77:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP77]], i32 0
-// CHECK: [[CONV_I:%.*]] = trunc i64 [[VECEXT_I]] to i32
-// CHECK: ret i32 [[CONV_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__V1_ADDR_I12_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I13_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V1_ADDR_I10_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I11_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V1_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T7_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T8_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T9_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T10_I:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <4 x i64> [[EXTRACT_I]], <4 x i64>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT2_I:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: store <4 x i64> [[EXTRACT2_I]], <4 x i64>* [[__T2_I]], align 32
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[__T2_I]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* [[__A_ADDR_I_I]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[__B_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* [[__A_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP5]] to <8 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, <4 x i64>* [[__B_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP7]] to <8 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <8 x i32> [[TMP6]], [[TMP8]]
+// CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i32> [[TMP6]], <8 x i32> [[TMP8]]
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i32> [[TMP10]] to <4 x i64>
+// CHECK-NEXT: store <4 x i64> [[TMP11]], <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT4_I:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: store <2 x i64> [[EXTRACT4_I]], <2 x i64>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT5_I:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: store <2 x i64> [[EXTRACT5_I]], <2 x i64>* [[__T5_I]], align 16
+// CHECK-NEXT: [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP15:%.*]] = load <2 x i64>, <2 x i64>* [[__T5_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP14]], <2 x i64>* [[__V1_ADDR_I12_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP15]], <2 x i64>* [[__V2_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I12_I]], align 16
+// CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i64> [[TMP16]] to <4 x i32>
+// CHECK-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i64> [[TMP18]] to <4 x i32>
+// CHECK-NEXT: [[TMP20:%.*]] = icmp sgt <4 x i32> [[TMP17]], [[TMP19]]
+// CHECK-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP20]], <4 x i32> [[TMP17]], <4 x i32> [[TMP19]]
+// CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[TMP21]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP22]], <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP23:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP24:%.*]] = bitcast <2 x i64> [[TMP23]] to <4 x i32>
+// CHECK-NEXT: [[TMP25:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP26:%.*]] = bitcast <2 x i64> [[TMP25]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP24]], <4 x i32> [[TMP26]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK-NEXT: [[TMP27:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP27]], <2 x i64>* [[__T7_I]], align 16
+// CHECK-NEXT: [[TMP28:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP29:%.*]] = load <2 x i64>, <2 x i64>* [[__T7_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP28]], <2 x i64>* [[__V1_ADDR_I10_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP29]], <2 x i64>* [[__V2_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP30:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I10_I]], align 16
+// CHECK-NEXT: [[TMP31:%.*]] = bitcast <2 x i64> [[TMP30]] to <4 x i32>
+// CHECK-NEXT: [[TMP32:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP33:%.*]] = bitcast <2 x i64> [[TMP32]] to <4 x i32>
+// CHECK-NEXT: [[TMP34:%.*]] = icmp sgt <4 x i32> [[TMP31]], [[TMP33]]
+// CHECK-NEXT: [[TMP35:%.*]] = select <4 x i1> [[TMP34]], <4 x i32> [[TMP31]], <4 x i32> [[TMP33]]
+// CHECK-NEXT: [[TMP36:%.*]] = bitcast <4 x i32> [[TMP35]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP36]], <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP37:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP38:%.*]] = bitcast <2 x i64> [[TMP37]] to <4 x i32>
+// CHECK-NEXT: [[TMP39:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP40:%.*]] = bitcast <2 x i64> [[TMP39]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE8_I:%.*]] = shufflevector <4 x i32> [[TMP38]], <4 x i32> [[TMP40]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: [[TMP41:%.*]] = bitcast <4 x i32> [[SHUFFLE8_I]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP41]], <2 x i64>* [[__T9_I]], align 16
+// CHECK-NEXT: [[TMP42:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP43:%.*]] = load <2 x i64>, <2 x i64>* [[__T9_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP42]], <2 x i64>* [[__V1_ADDR_I_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP43]], <2 x i64>* [[__V2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP44:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP45:%.*]] = bitcast <2 x i64> [[TMP44]] to <4 x i32>
+// CHECK-NEXT: [[TMP46:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP47:%.*]] = bitcast <2 x i64> [[TMP46]] to <4 x i32>
+// CHECK-NEXT: [[TMP48:%.*]] = icmp sgt <4 x i32> [[TMP45]], [[TMP47]]
+// CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[TMP45]], <4 x i32> [[TMP47]]
+// CHECK-NEXT: [[TMP50:%.*]] = bitcast <4 x i32> [[TMP49]] to <2 x i64>
+// CHECK-NEXT: store <4 x i32> [[TMP49]], <4 x i32>* [[__T10_I]], align 16
+// CHECK-NEXT: [[TMP51:%.*]] = load <4 x i32>, <4 x i32>* [[__T10_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP51]], i32 0
+// CHECK-NEXT: ret i32 [[VECEXT_I]]
int test_mm512_reduce_max_epi32(__m512i __W){
return _mm512_reduce_max_epi32(__W);
}
// CHECK-LABEL: define i32 @test_mm512_reduce_max_epu32(<8 x i64> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I17_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I18_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I19_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I14_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I15_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I16_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I11_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I12_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I13_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK: store <8 x i64> %__W, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: store <8 x i64> [[TMP0]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
-// CHECK: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i32> [[SHUFFLE_I]] to <8 x i64>
-// CHECK: [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP6]] to <16 x i32>
-// CHECK: [[TMP8:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP8]] to <16 x i32>
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP9]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i32> [[SHUFFLE1_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP5]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> [[TMP10]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP11]] to <16 x i32>
-// CHECK: [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i64> [[TMP13]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i64> [[TMP15]] to <16 x i32>
-// CHECK: [[TMP17:%.*]] = icmp ugt <16 x i32> [[TMP12]], [[TMP14]]
-// CHECK: [[TMP18:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP12]], <16 x i32> [[TMP14]]
-// CHECK: [[TMP19:%.*]] = bitcast <16 x i32> [[TMP18]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP19]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP21:%.*]] = bitcast <8 x i64> [[TMP20]] to <16 x i32>
-// CHECK: [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP23:%.*]] = bitcast <8 x i64> [[TMP22]] to <16 x i32>
-// CHECK: [[SHUFFLE2_I:%.*]] = shufflevector <16 x i32> [[TMP21]], <16 x i32> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP24:%.*]] = bitcast <16 x i32> [[SHUFFLE2_I]] to <8 x i64>
-// CHECK: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP26:%.*]] = bitcast <8 x i64> [[TMP25]] to <16 x i32>
-// CHECK: [[TMP27:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP28:%.*]] = bitcast <8 x i64> [[TMP27]] to <16 x i32>
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <16 x i32> [[TMP26]], <16 x i32> [[TMP28]], <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP29:%.*]] = bitcast <16 x i32> [[SHUFFLE3_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP24]], <8 x i64>* [[__A_ADDR_I18_I]], align 64
-// CHECK: store <8 x i64> [[TMP29]], <8 x i64>* [[__B_ADDR_I19_I]], align 64
-// CHECK: [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I18_I]], align 64
-// CHECK: [[TMP31:%.*]] = bitcast <8 x i64> [[TMP30]] to <16 x i32>
-// CHECK: [[TMP32:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I19_I]], align 64
-// CHECK: [[TMP33:%.*]] = bitcast <8 x i64> [[TMP32]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I17_I]], align 64
-// CHECK: [[TMP34:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I17_I]], align 64
-// CHECK: [[TMP35:%.*]] = bitcast <8 x i64> [[TMP34]] to <16 x i32>
-// CHECK: [[TMP36:%.*]] = icmp ugt <16 x i32> [[TMP31]], [[TMP33]]
-// CHECK: [[TMP37:%.*]] = select <16 x i1> [[TMP36]], <16 x i32> [[TMP31]], <16 x i32> [[TMP33]]
-// CHECK: [[TMP38:%.*]] = bitcast <16 x i32> [[TMP37]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP38]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP39:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP40:%.*]] = bitcast <8 x i64> [[TMP39]] to <16 x i32>
-// CHECK: [[TMP41:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP42:%.*]] = bitcast <8 x i64> [[TMP41]] to <16 x i32>
-// CHECK: [[SHUFFLE5_I:%.*]] = shufflevector <16 x i32> [[TMP40]], <16 x i32> [[TMP42]], <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP43:%.*]] = bitcast <16 x i32> [[SHUFFLE5_I]] to <8 x i64>
-// CHECK: [[TMP44:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP45:%.*]] = bitcast <8 x i64> [[TMP44]] to <16 x i32>
-// CHECK: [[TMP46:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP47:%.*]] = bitcast <8 x i64> [[TMP46]] to <16 x i32>
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP47]], <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP48:%.*]] = bitcast <16 x i32> [[SHUFFLE6_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP43]], <8 x i64>* [[__A_ADDR_I15_I]], align 64
-// CHECK: store <8 x i64> [[TMP48]], <8 x i64>* [[__B_ADDR_I16_I]], align 64
-// CHECK: [[TMP49:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I15_I]], align 64
-// CHECK: [[TMP50:%.*]] = bitcast <8 x i64> [[TMP49]] to <16 x i32>
-// CHECK: [[TMP51:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I16_I]], align 64
-// CHECK: [[TMP52:%.*]] = bitcast <8 x i64> [[TMP51]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I14_I]], align 64
-// CHECK: [[TMP53:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I14_I]], align 64
-// CHECK: [[TMP54:%.*]] = bitcast <8 x i64> [[TMP53]] to <16 x i32>
-// CHECK: [[TMP55:%.*]] = icmp ugt <16 x i32> [[TMP50]], [[TMP52]]
-// CHECK: [[TMP56:%.*]] = select <16 x i1> [[TMP55]], <16 x i32> [[TMP50]], <16 x i32> [[TMP52]]
-// CHECK: [[TMP57:%.*]] = bitcast <16 x i32> [[TMP56]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP57]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP58:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP59:%.*]] = bitcast <8 x i64> [[TMP58]] to <16 x i32>
-// CHECK: [[TMP60:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP61:%.*]] = bitcast <8 x i64> [[TMP60]] to <16 x i32>
-// CHECK: [[SHUFFLE8_I:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP62:%.*]] = bitcast <16 x i32> [[SHUFFLE8_I]] to <8 x i64>
-// CHECK: [[TMP63:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP64:%.*]] = bitcast <8 x i64> [[TMP63]] to <16 x i32>
-// CHECK: [[TMP65:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP66:%.*]] = bitcast <8 x i64> [[TMP65]] to <16 x i32>
-// CHECK: [[SHUFFLE9_I:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP66]], <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP67:%.*]] = bitcast <16 x i32> [[SHUFFLE9_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP62]], <8 x i64>* [[__A_ADDR_I12_I]], align 64
-// CHECK: store <8 x i64> [[TMP67]], <8 x i64>* [[__B_ADDR_I13_I]], align 64
-// CHECK: [[TMP68:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I12_I]], align 64
-// CHECK: [[TMP69:%.*]] = bitcast <8 x i64> [[TMP68]] to <16 x i32>
-// CHECK: [[TMP70:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I13_I]], align 64
-// CHECK: [[TMP71:%.*]] = bitcast <8 x i64> [[TMP70]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP72:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP73:%.*]] = bitcast <8 x i64> [[TMP72]] to <16 x i32>
-// CHECK: [[TMP74:%.*]] = icmp ugt <16 x i32> [[TMP69]], [[TMP71]]
-// CHECK: [[TMP75:%.*]] = select <16 x i1> [[TMP74]], <16 x i32> [[TMP69]], <16 x i32> [[TMP71]]
-// CHECK: [[TMP76:%.*]] = bitcast <16 x i32> [[TMP75]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP76]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP77:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP77]], i32 0
-// CHECK: [[CONV_I:%.*]] = trunc i64 [[VECEXT_I]] to i32
-// CHECK: ret i32 [[CONV_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__V1_ADDR_I12_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I13_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V1_ADDR_I10_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I11_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V1_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T7_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T8_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T9_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T10_I:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <4 x i64> [[EXTRACT_I]], <4 x i64>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT2_I:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: store <4 x i64> [[EXTRACT2_I]], <4 x i64>* [[__T2_I]], align 32
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[__T2_I]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* [[__A_ADDR_I_I]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[__B_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* [[__A_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP5]] to <8 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, <4 x i64>* [[__B_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP7]] to <8 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <8 x i32> [[TMP6]], [[TMP8]]
+// CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i32> [[TMP6]], <8 x i32> [[TMP8]]
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i32> [[TMP10]] to <4 x i64>
+// CHECK-NEXT: store <4 x i64> [[TMP11]], <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT4_I:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: store <2 x i64> [[EXTRACT4_I]], <2 x i64>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT5_I:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: store <2 x i64> [[EXTRACT5_I]], <2 x i64>* [[__T5_I]], align 16
+// CHECK-NEXT: [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP15:%.*]] = load <2 x i64>, <2 x i64>* [[__T5_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP14]], <2 x i64>* [[__V1_ADDR_I12_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP15]], <2 x i64>* [[__V2_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I12_I]], align 16
+// CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i64> [[TMP16]] to <4 x i32>
+// CHECK-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i64> [[TMP18]] to <4 x i32>
+// CHECK-NEXT: [[TMP20:%.*]] = icmp ugt <4 x i32> [[TMP17]], [[TMP19]]
+// CHECK-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP20]], <4 x i32> [[TMP17]], <4 x i32> [[TMP19]]
+// CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[TMP21]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP22]], <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP23:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP24:%.*]] = bitcast <2 x i64> [[TMP23]] to <4 x i32>
+// CHECK-NEXT: [[TMP25:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP26:%.*]] = bitcast <2 x i64> [[TMP25]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP24]], <4 x i32> [[TMP26]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK-NEXT: [[TMP27:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP27]], <2 x i64>* [[__T7_I]], align 16
+// CHECK-NEXT: [[TMP28:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP29:%.*]] = load <2 x i64>, <2 x i64>* [[__T7_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP28]], <2 x i64>* [[__V1_ADDR_I10_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP29]], <2 x i64>* [[__V2_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP30:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I10_I]], align 16
+// CHECK-NEXT: [[TMP31:%.*]] = bitcast <2 x i64> [[TMP30]] to <4 x i32>
+// CHECK-NEXT: [[TMP32:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP33:%.*]] = bitcast <2 x i64> [[TMP32]] to <4 x i32>
+// CHECK-NEXT: [[TMP34:%.*]] = icmp ugt <4 x i32> [[TMP31]], [[TMP33]]
+// CHECK-NEXT: [[TMP35:%.*]] = select <4 x i1> [[TMP34]], <4 x i32> [[TMP31]], <4 x i32> [[TMP33]]
+// CHECK-NEXT: [[TMP36:%.*]] = bitcast <4 x i32> [[TMP35]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP36]], <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP37:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP38:%.*]] = bitcast <2 x i64> [[TMP37]] to <4 x i32>
+// CHECK-NEXT: [[TMP39:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP40:%.*]] = bitcast <2 x i64> [[TMP39]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE8_I:%.*]] = shufflevector <4 x i32> [[TMP38]], <4 x i32> [[TMP40]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: [[TMP41:%.*]] = bitcast <4 x i32> [[SHUFFLE8_I]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP41]], <2 x i64>* [[__T9_I]], align 16
+// CHECK-NEXT: [[TMP42:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP43:%.*]] = load <2 x i64>, <2 x i64>* [[__T9_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP42]], <2 x i64>* [[__V1_ADDR_I_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP43]], <2 x i64>* [[__V2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP44:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP45:%.*]] = bitcast <2 x i64> [[TMP44]] to <4 x i32>
+// CHECK-NEXT: [[TMP46:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP47:%.*]] = bitcast <2 x i64> [[TMP46]] to <4 x i32>
+// CHECK-NEXT: [[TMP48:%.*]] = icmp ugt <4 x i32> [[TMP45]], [[TMP47]]
+// CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[TMP45]], <4 x i32> [[TMP47]]
+// CHECK-NEXT: [[TMP50:%.*]] = bitcast <4 x i32> [[TMP49]] to <2 x i64>
+// CHECK-NEXT: store <4 x i32> [[TMP49]], <4 x i32>* [[__T10_I]], align 16
+// CHECK-NEXT: [[TMP51:%.*]] = load <4 x i32>, <4 x i32>* [[__T10_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP51]], i32 0
+// CHECK-NEXT: ret i32 [[VECEXT_I]]
unsigned int test_mm512_reduce_max_epu32(__m512i __W){
return _mm512_reduce_max_epu32(__W);
}
// CHECK-LABEL: define float @test_mm512_reduce_max_ps(<16 x float> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I17_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__A_ADDR_I18_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__B_ADDR_I19_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I14_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__A_ADDR_I15_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__B_ADDR_I16_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I11_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__A_ADDR_I12_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__B_ADDR_I13_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[A_ADDR_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__W_ADDR:%.*]] = alloca <16 x float>, align 64
-// CHECK: store <16 x float> %__W, <16 x float>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load <16 x float>, <16 x float>* [[__W_ADDR]], align 64
-// CHECK: store <16 x float> [[TMP0]], <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP3:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP4:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> [[TMP4]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <16 x float> [[SHUFFLE_I]], <16 x float>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <16 x float> [[SHUFFLE1_I]], <16 x float>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP5:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP6:%.*]] = load <16 x float>, <16 x float>* [[__B_ADDR_I_I]], align 64
-// CHECK: store <16 x float> zeroinitializer, <16 x float>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> [[TMP5]], <16 x float> [[TMP6]], <16 x float> [[TMP7]], i16 -1, i32 4) #2
-// CHECK: store <16 x float> [[TMP8]], <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP9:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP10:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[SHUFFLE2_I:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP11:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP12:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> [[TMP12]], <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <16 x float> [[SHUFFLE2_I]], <16 x float>* [[__A_ADDR_I18_I]], align 64
-// CHECK: store <16 x float> [[SHUFFLE3_I]], <16 x float>* [[__B_ADDR_I19_I]], align 64
-// CHECK: [[TMP13:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I18_I]], align 64
-// CHECK: [[TMP14:%.*]] = load <16 x float>, <16 x float>* [[__B_ADDR_I19_I]], align 64
-// CHECK: store <16 x float> zeroinitializer, <16 x float>* [[_COMPOUNDLITERAL_I_I17_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I17_I]], align 64
-// CHECK: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> [[TMP13]], <16 x float> [[TMP14]], <16 x float> [[TMP15]], i16 -1, i32 4) #2
-// CHECK: store <16 x float> [[TMP16]], <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP17:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP18:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[SHUFFLE5_I:%.*]] = shufflevector <16 x float> [[TMP17]], <16 x float> [[TMP18]], <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP19:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <16 x float> [[SHUFFLE5_I]], <16 x float>* [[__A_ADDR_I15_I]], align 64
-// CHECK: store <16 x float> [[SHUFFLE6_I]], <16 x float>* [[__B_ADDR_I16_I]], align 64
-// CHECK: [[TMP21:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I15_I]], align 64
-// CHECK: [[TMP22:%.*]] = load <16 x float>, <16 x float>* [[__B_ADDR_I16_I]], align 64
-// CHECK: store <16 x float> zeroinitializer, <16 x float>* [[_COMPOUNDLITERAL_I_I14_I]], align 64
-// CHECK: [[TMP23:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I14_I]], align 64
-// CHECK: [[TMP24:%.*]] = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> [[TMP21]], <16 x float> [[TMP22]], <16 x float> [[TMP23]], i16 -1, i32 4) #2
-// CHECK: store <16 x float> [[TMP24]], <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP25:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP26:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[SHUFFLE8_I:%.*]] = shufflevector <16 x float> [[TMP25]], <16 x float> [[TMP26]], <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP27:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP28:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[SHUFFLE9_I:%.*]] = shufflevector <16 x float> [[TMP27]], <16 x float> [[TMP28]], <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <16 x float> [[SHUFFLE8_I]], <16 x float>* [[__A_ADDR_I12_I]], align 64
-// CHECK: store <16 x float> [[SHUFFLE9_I]], <16 x float>* [[__B_ADDR_I13_I]], align 64
-// CHECK: [[TMP29:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I12_I]], align 64
-// CHECK: [[TMP30:%.*]] = load <16 x float>, <16 x float>* [[__B_ADDR_I13_I]], align 64
-// CHECK: store <16 x float> zeroinitializer, <16 x float>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP31:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP32:%.*]] = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> [[TMP29]], <16 x float> [[TMP30]], <16 x float> [[TMP31]], i16 -1, i32 4) #2
-// CHECK: store <16 x float> [[TMP32]], <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP33:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <16 x float> [[TMP33]], i32 0
-// CHECK: ret float [[VECEXT_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__A_ADDR_I14_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__B_ADDR_I15_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__A_ADDR_I12_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_ADDR_I13_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__A_ADDR_I10_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_ADDR_I11_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T7_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T8_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T9_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T10_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT: store <16 x float> [[__W:%.*]], <16 x float>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, <16 x float>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store <16 x float> [[TMP0]], <16 x float>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x float> [[TMP1]] to <8 x double>
+// CHECK-NEXT: [[EXTRACT_I:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x double> [[EXTRACT_I]] to <8 x float>
+// CHECK-NEXT: store <8 x float> [[TMP3]], <8 x float>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP4:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x float> [[TMP4]] to <8 x double>
+// CHECK-NEXT: [[EXTRACT2_I:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x double> [[EXTRACT2_I]] to <8 x float>
+// CHECK-NEXT: store <8 x float> [[TMP6]], <8 x float>* [[__T2_I]], align 32
+// CHECK-NEXT: [[TMP7:%.*]] = load <8 x float>, <8 x float>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP8:%.*]] = load <8 x float>, <8 x float>* [[__T2_I]], align 32
+// CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[__A_ADDR_I14_I]], align 32
+// CHECK-NEXT: store <8 x float> [[TMP8]], <8 x float>* [[__B_ADDR_I15_I]], align 32
+// CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, <8 x float>* [[__A_ADDR_I14_I]], align 32
+// CHECK-NEXT: [[TMP10:%.*]] = load <8 x float>, <8 x float>* [[__B_ADDR_I15_I]], align 32
+// CHECK-NEXT: [[TMP11:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[TMP9]], <8 x float> [[TMP10]]) #2
+// CHECK-NEXT: store <8 x float> [[TMP11]], <8 x float>* [[__T3_I]], align 32
+// CHECK-NEXT: [[TMP12:%.*]] = load <8 x float>, <8 x float>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT4_I:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <4 x float> [[EXTRACT4_I]], <4 x float>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP13:%.*]] = load <8 x float>, <8 x float>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT5_I:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: store <4 x float> [[EXTRACT5_I]], <4 x float>* [[__T5_I]], align 16
+// CHECK-NEXT: [[TMP14:%.*]] = load <4 x float>, <4 x float>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP15:%.*]] = load <4 x float>, <4 x float>* [[__T5_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP14]], <4 x float>* [[__A_ADDR_I12_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP15]], <4 x float>* [[__B_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP16:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I12_I]], align 16
+// CHECK-NEXT: [[TMP17:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP18:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[TMP16]], <4 x float> [[TMP17]]) #2
+// CHECK-NEXT: store <4 x float> [[TMP18]], <4 x float>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP19:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP20:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK-NEXT: store <4 x float> [[SHUFFLE_I]], <4 x float>* [[__T7_I]], align 16
+// CHECK-NEXT: [[TMP21:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP22:%.*]] = load <4 x float>, <4 x float>* [[__T7_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP21]], <4 x float>* [[__A_ADDR_I10_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP22]], <4 x float>* [[__B_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP23:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I10_I]], align 16
+// CHECK-NEXT: [[TMP24:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP25:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[TMP23]], <4 x float> [[TMP24]]) #2
+// CHECK-NEXT: store <4 x float> [[TMP25]], <4 x float>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP26:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP27:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
+// CHECK-NEXT: [[SHUFFLE8_I:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> [[TMP27]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: store <4 x float> [[SHUFFLE8_I]], <4 x float>* [[__T9_I]], align 16
+// CHECK-NEXT: [[TMP28:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP29:%.*]] = load <4 x float>, <4 x float>* [[__T9_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP28]], <4 x float>* [[__A_ADDR_I_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP29]], <4 x float>* [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP30:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP31:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP32:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[TMP30]], <4 x float> [[TMP31]]) #2
+// CHECK-NEXT: store <4 x float> [[TMP32]], <4 x float>* [[__T10_I]], align 16
+// CHECK-NEXT: [[TMP33:%.*]] = load <4 x float>, <4 x float>* [[__T10_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x float> [[TMP33]], i32 0
+// CHECK-NEXT: ret float [[VECEXT_I]]
float test_mm512_reduce_max_ps(__m512 __W){
return _mm512_reduce_max_ps(__W);
}
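// Illustrative sketch (hedged, not part of the checked IR): the expectations for
// _mm512_reduce_max_ps funnel the 512-bit input through llvm.x86.avx.max.ps.256
// and llvm.x86.sse.max.ps on successive halves. Ignoring the NaN and signed-zero
// ordering quirks of the underlying maxps instruction, the scalar reference is:
//
//   static float ref_reduce_max_ps(const float w[16]) {
//     float r = w[0];
//     for (int i = 1; i < 16; ++i)
//       if (w[i] > r) r = w[i];               // plain ordered compare
//     return r;
//   }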
// CHECK-LABEL: define i32 @test_mm512_reduce_min_epi32(<8 x i64> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I17_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I18_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I19_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I14_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I15_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I16_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I11_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I12_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I13_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK: store <8 x i64> %__W, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: store <8 x i64> [[TMP0]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
-// CHECK: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i32> [[SHUFFLE_I]] to <8 x i64>
-// CHECK: [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP6]] to <16 x i32>
-// CHECK: [[TMP8:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP8]] to <16 x i32>
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP9]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i32> [[SHUFFLE1_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP5]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> [[TMP10]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP11]] to <16 x i32>
-// CHECK: [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i64> [[TMP13]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i64> [[TMP15]] to <16 x i32>
-// CHECK: [[TMP17:%.*]] = icmp slt <16 x i32> [[TMP12]], [[TMP14]]
-// CHECK: [[TMP18:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP12]], <16 x i32> [[TMP14]]
-// CHECK: [[TMP19:%.*]] = bitcast <16 x i32> [[TMP18]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP19]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP21:%.*]] = bitcast <8 x i64> [[TMP20]] to <16 x i32>
-// CHECK: [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP23:%.*]] = bitcast <8 x i64> [[TMP22]] to <16 x i32>
-// CHECK: [[SHUFFLE2_I:%.*]] = shufflevector <16 x i32> [[TMP21]], <16 x i32> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP24:%.*]] = bitcast <16 x i32> [[SHUFFLE2_I]] to <8 x i64>
-// CHECK: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP26:%.*]] = bitcast <8 x i64> [[TMP25]] to <16 x i32>
-// CHECK: [[TMP27:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP28:%.*]] = bitcast <8 x i64> [[TMP27]] to <16 x i32>
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <16 x i32> [[TMP26]], <16 x i32> [[TMP28]], <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP29:%.*]] = bitcast <16 x i32> [[SHUFFLE3_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP24]], <8 x i64>* [[__A_ADDR_I18_I]], align 64
-// CHECK: store <8 x i64> [[TMP29]], <8 x i64>* [[__B_ADDR_I19_I]], align 64
-// CHECK: [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I18_I]], align 64
-// CHECK: [[TMP31:%.*]] = bitcast <8 x i64> [[TMP30]] to <16 x i32>
-// CHECK: [[TMP32:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I19_I]], align 64
-// CHECK: [[TMP33:%.*]] = bitcast <8 x i64> [[TMP32]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I17_I]], align 64
-// CHECK: [[TMP34:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I17_I]], align 64
-// CHECK: [[TMP35:%.*]] = bitcast <8 x i64> [[TMP34]] to <16 x i32>
-// CHECK: [[TMP36:%.*]] = icmp slt <16 x i32> [[TMP31]], [[TMP33]]
-// CHECK: [[TMP37:%.*]] = select <16 x i1> [[TMP36]], <16 x i32> [[TMP31]], <16 x i32> [[TMP33]]
-// CHECK: [[TMP38:%.*]] = bitcast <16 x i32> [[TMP37]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP38]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP39:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP40:%.*]] = bitcast <8 x i64> [[TMP39]] to <16 x i32>
-// CHECK: [[TMP41:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP42:%.*]] = bitcast <8 x i64> [[TMP41]] to <16 x i32>
-// CHECK: [[SHUFFLE5_I:%.*]] = shufflevector <16 x i32> [[TMP40]], <16 x i32> [[TMP42]], <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP43:%.*]] = bitcast <16 x i32> [[SHUFFLE5_I]] to <8 x i64>
-// CHECK: [[TMP44:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP45:%.*]] = bitcast <8 x i64> [[TMP44]] to <16 x i32>
-// CHECK: [[TMP46:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP47:%.*]] = bitcast <8 x i64> [[TMP46]] to <16 x i32>
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP47]], <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP48:%.*]] = bitcast <16 x i32> [[SHUFFLE6_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP43]], <8 x i64>* [[__A_ADDR_I15_I]], align 64
-// CHECK: store <8 x i64> [[TMP48]], <8 x i64>* [[__B_ADDR_I16_I]], align 64
-// CHECK: [[TMP49:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I15_I]], align 64
-// CHECK: [[TMP50:%.*]] = bitcast <8 x i64> [[TMP49]] to <16 x i32>
-// CHECK: [[TMP51:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I16_I]], align 64
-// CHECK: [[TMP52:%.*]] = bitcast <8 x i64> [[TMP51]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I14_I]], align 64
-// CHECK: [[TMP53:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I14_I]], align 64
-// CHECK: [[TMP54:%.*]] = bitcast <8 x i64> [[TMP53]] to <16 x i32>
-// CHECK: [[TMP55:%.*]] = icmp slt <16 x i32> [[TMP50]], [[TMP52]]
-// CHECK: [[TMP56:%.*]] = select <16 x i1> [[TMP55]], <16 x i32> [[TMP50]], <16 x i32> [[TMP52]]
-// CHECK: [[TMP57:%.*]] = bitcast <16 x i32> [[TMP56]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP57]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP58:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP59:%.*]] = bitcast <8 x i64> [[TMP58]] to <16 x i32>
-// CHECK: [[TMP60:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP61:%.*]] = bitcast <8 x i64> [[TMP60]] to <16 x i32>
-// CHECK: [[SHUFFLE8_I:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP62:%.*]] = bitcast <16 x i32> [[SHUFFLE8_I]] to <8 x i64>
-// CHECK: [[TMP63:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP64:%.*]] = bitcast <8 x i64> [[TMP63]] to <16 x i32>
-// CHECK: [[TMP65:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP66:%.*]] = bitcast <8 x i64> [[TMP65]] to <16 x i32>
-// CHECK: [[SHUFFLE9_I:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP66]], <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP67:%.*]] = bitcast <16 x i32> [[SHUFFLE9_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP62]], <8 x i64>* [[__A_ADDR_I12_I]], align 64
-// CHECK: store <8 x i64> [[TMP67]], <8 x i64>* [[__B_ADDR_I13_I]], align 64
-// CHECK: [[TMP68:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I12_I]], align 64
-// CHECK: [[TMP69:%.*]] = bitcast <8 x i64> [[TMP68]] to <16 x i32>
-// CHECK: [[TMP70:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I13_I]], align 64
-// CHECK: [[TMP71:%.*]] = bitcast <8 x i64> [[TMP70]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP72:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP73:%.*]] = bitcast <8 x i64> [[TMP72]] to <16 x i32>
-// CHECK: [[TMP74:%.*]] = icmp slt <16 x i32> [[TMP69]], [[TMP71]]
-// CHECK: [[TMP75:%.*]] = select <16 x i1> [[TMP74]], <16 x i32> [[TMP69]], <16 x i32> [[TMP71]]
-// CHECK: [[TMP76:%.*]] = bitcast <16 x i32> [[TMP75]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP76]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP77:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP77]], i32 0
-// CHECK: [[CONV_I:%.*]] = trunc i64 [[VECEXT_I]] to i32
-// CHECK: ret i32 [[CONV_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__V1_ADDR_I12_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I13_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V1_ADDR_I10_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I11_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V1_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T7_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T8_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T9_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T10_I:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <4 x i64> [[EXTRACT_I]], <4 x i64>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT2_I:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: store <4 x i64> [[EXTRACT2_I]], <4 x i64>* [[__T2_I]], align 32
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[__T2_I]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* [[__A_ADDR_I_I]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[__B_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* [[__A_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP5]] to <8 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, <4 x i64>* [[__B_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP7]] to <8 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[TMP6]], [[TMP8]]
+// CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i32> [[TMP6]], <8 x i32> [[TMP8]]
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i32> [[TMP10]] to <4 x i64>
+// CHECK-NEXT: store <4 x i64> [[TMP11]], <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT4_I:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: store <2 x i64> [[EXTRACT4_I]], <2 x i64>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT5_I:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: store <2 x i64> [[EXTRACT5_I]], <2 x i64>* [[__T5_I]], align 16
+// CHECK-NEXT: [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP15:%.*]] = load <2 x i64>, <2 x i64>* [[__T5_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP14]], <2 x i64>* [[__V1_ADDR_I12_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP15]], <2 x i64>* [[__V2_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I12_I]], align 16
+// CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i64> [[TMP16]] to <4 x i32>
+// CHECK-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i64> [[TMP18]] to <4 x i32>
+// CHECK-NEXT: [[TMP20:%.*]] = icmp slt <4 x i32> [[TMP17]], [[TMP19]]
+// CHECK-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP20]], <4 x i32> [[TMP17]], <4 x i32> [[TMP19]]
+// CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[TMP21]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP22]], <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP23:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP24:%.*]] = bitcast <2 x i64> [[TMP23]] to <4 x i32>
+// CHECK-NEXT: [[TMP25:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP26:%.*]] = bitcast <2 x i64> [[TMP25]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP24]], <4 x i32> [[TMP26]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK-NEXT: [[TMP27:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP27]], <2 x i64>* [[__T7_I]], align 16
+// CHECK-NEXT: [[TMP28:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP29:%.*]] = load <2 x i64>, <2 x i64>* [[__T7_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP28]], <2 x i64>* [[__V1_ADDR_I10_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP29]], <2 x i64>* [[__V2_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP30:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I10_I]], align 16
+// CHECK-NEXT: [[TMP31:%.*]] = bitcast <2 x i64> [[TMP30]] to <4 x i32>
+// CHECK-NEXT: [[TMP32:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP33:%.*]] = bitcast <2 x i64> [[TMP32]] to <4 x i32>
+// CHECK-NEXT: [[TMP34:%.*]] = icmp slt <4 x i32> [[TMP31]], [[TMP33]]
+// CHECK-NEXT: [[TMP35:%.*]] = select <4 x i1> [[TMP34]], <4 x i32> [[TMP31]], <4 x i32> [[TMP33]]
+// CHECK-NEXT: [[TMP36:%.*]] = bitcast <4 x i32> [[TMP35]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP36]], <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP37:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP38:%.*]] = bitcast <2 x i64> [[TMP37]] to <4 x i32>
+// CHECK-NEXT: [[TMP39:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP40:%.*]] = bitcast <2 x i64> [[TMP39]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE8_I:%.*]] = shufflevector <4 x i32> [[TMP38]], <4 x i32> [[TMP40]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: [[TMP41:%.*]] = bitcast <4 x i32> [[SHUFFLE8_I]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP41]], <2 x i64>* [[__T9_I]], align 16
+// CHECK-NEXT: [[TMP42:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP43:%.*]] = load <2 x i64>, <2 x i64>* [[__T9_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP42]], <2 x i64>* [[__V1_ADDR_I_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP43]], <2 x i64>* [[__V2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP44:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP45:%.*]] = bitcast <2 x i64> [[TMP44]] to <4 x i32>
+// CHECK-NEXT: [[TMP46:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP47:%.*]] = bitcast <2 x i64> [[TMP46]] to <4 x i32>
+// CHECK-NEXT: [[TMP48:%.*]] = icmp slt <4 x i32> [[TMP45]], [[TMP47]]
+// CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[TMP45]], <4 x i32> [[TMP47]]
+// CHECK-NEXT: [[TMP50:%.*]] = bitcast <4 x i32> [[TMP49]] to <2 x i64>
+// CHECK-NEXT: store <4 x i32> [[TMP49]], <4 x i32>* [[__T10_I]], align 16
+// CHECK-NEXT: [[TMP51:%.*]] = load <4 x i32>, <4 x i32>* [[__T10_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP51]], i32 0
+// CHECK-NEXT: ret i32 [[VECEXT_I]]
int test_mm512_reduce_min_epi32(__m512i __W){
return _mm512_reduce_min_epi32(__W);
}
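// Illustrative sketch, assuming __W is viewed as sixteen signed 32-bit lanes:
// the icmp slt/select pairs checked above for _mm512_reduce_min_epi32 implement
// a pairwise-min tree, which reduces to this scalar reference:
//
//   static int ref_reduce_min_epi32(const int w[16]) {
//     int r = w[0];
//     for (int i = 1; i < 16; ++i)
//       if (w[i] < r) r = w[i];               // signed compare, as icmp slt
//     return r;
//   }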
// CHECK-LABEL: define i32 @test_mm512_reduce_min_epu32(<8 x i64> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I17_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I18_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I19_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I14_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I15_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I16_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I11_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I12_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I13_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK: store <8 x i64> %__W, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: store <8 x i64> [[TMP0]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
-// CHECK: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i32> [[SHUFFLE_I]] to <8 x i64>
-// CHECK: [[TMP6:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP6]] to <16 x i32>
-// CHECK: [[TMP8:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP8]] to <16 x i32>
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP9]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i32> [[SHUFFLE1_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP5]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> [[TMP10]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP11:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP11]] to <16 x i32>
-// CHECK: [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i64> [[TMP13]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i64> [[TMP15]] to <16 x i32>
-// CHECK: [[TMP17:%.*]] = icmp ult <16 x i32> [[TMP12]], [[TMP14]]
-// CHECK: [[TMP18:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP12]], <16 x i32> [[TMP14]]
-// CHECK: [[TMP19:%.*]] = bitcast <16 x i32> [[TMP18]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP19]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP21:%.*]] = bitcast <8 x i64> [[TMP20]] to <16 x i32>
-// CHECK: [[TMP22:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP23:%.*]] = bitcast <8 x i64> [[TMP22]] to <16 x i32>
-// CHECK: [[SHUFFLE2_I:%.*]] = shufflevector <16 x i32> [[TMP21]], <16 x i32> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP24:%.*]] = bitcast <16 x i32> [[SHUFFLE2_I]] to <8 x i64>
-// CHECK: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP26:%.*]] = bitcast <8 x i64> [[TMP25]] to <16 x i32>
-// CHECK: [[TMP27:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP28:%.*]] = bitcast <8 x i64> [[TMP27]] to <16 x i32>
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <16 x i32> [[TMP26]], <16 x i32> [[TMP28]], <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP29:%.*]] = bitcast <16 x i32> [[SHUFFLE3_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP24]], <8 x i64>* [[__A_ADDR_I18_I]], align 64
-// CHECK: store <8 x i64> [[TMP29]], <8 x i64>* [[__B_ADDR_I19_I]], align 64
-// CHECK: [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I18_I]], align 64
-// CHECK: [[TMP31:%.*]] = bitcast <8 x i64> [[TMP30]] to <16 x i32>
-// CHECK: [[TMP32:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I19_I]], align 64
-// CHECK: [[TMP33:%.*]] = bitcast <8 x i64> [[TMP32]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I17_I]], align 64
-// CHECK: [[TMP34:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I17_I]], align 64
-// CHECK: [[TMP35:%.*]] = bitcast <8 x i64> [[TMP34]] to <16 x i32>
-// CHECK: [[TMP36:%.*]] = icmp ult <16 x i32> [[TMP31]], [[TMP33]]
-// CHECK: [[TMP37:%.*]] = select <16 x i1> [[TMP36]], <16 x i32> [[TMP31]], <16 x i32> [[TMP33]]
-// CHECK: [[TMP38:%.*]] = bitcast <16 x i32> [[TMP37]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP38]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP39:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP40:%.*]] = bitcast <8 x i64> [[TMP39]] to <16 x i32>
-// CHECK: [[TMP41:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP42:%.*]] = bitcast <8 x i64> [[TMP41]] to <16 x i32>
-// CHECK: [[SHUFFLE5_I:%.*]] = shufflevector <16 x i32> [[TMP40]], <16 x i32> [[TMP42]], <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP43:%.*]] = bitcast <16 x i32> [[SHUFFLE5_I]] to <8 x i64>
-// CHECK: [[TMP44:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP45:%.*]] = bitcast <8 x i64> [[TMP44]] to <16 x i32>
-// CHECK: [[TMP46:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP47:%.*]] = bitcast <8 x i64> [[TMP46]] to <16 x i32>
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP47]], <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP48:%.*]] = bitcast <16 x i32> [[SHUFFLE6_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP43]], <8 x i64>* [[__A_ADDR_I15_I]], align 64
-// CHECK: store <8 x i64> [[TMP48]], <8 x i64>* [[__B_ADDR_I16_I]], align 64
-// CHECK: [[TMP49:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I15_I]], align 64
-// CHECK: [[TMP50:%.*]] = bitcast <8 x i64> [[TMP49]] to <16 x i32>
-// CHECK: [[TMP51:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I16_I]], align 64
-// CHECK: [[TMP52:%.*]] = bitcast <8 x i64> [[TMP51]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I14_I]], align 64
-// CHECK: [[TMP53:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I14_I]], align 64
-// CHECK: [[TMP54:%.*]] = bitcast <8 x i64> [[TMP53]] to <16 x i32>
-// CHECK: [[TMP55:%.*]] = icmp ult <16 x i32> [[TMP50]], [[TMP52]]
-// CHECK: [[TMP56:%.*]] = select <16 x i1> [[TMP55]], <16 x i32> [[TMP50]], <16 x i32> [[TMP52]]
-// CHECK: [[TMP57:%.*]] = bitcast <16 x i32> [[TMP56]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP57]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP58:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP59:%.*]] = bitcast <8 x i64> [[TMP58]] to <16 x i32>
-// CHECK: [[TMP60:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP61:%.*]] = bitcast <8 x i64> [[TMP60]] to <16 x i32>
-// CHECK: [[SHUFFLE8_I:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP62:%.*]] = bitcast <16 x i32> [[SHUFFLE8_I]] to <8 x i64>
-// CHECK: [[TMP63:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP64:%.*]] = bitcast <8 x i64> [[TMP63]] to <16 x i32>
-// CHECK: [[TMP65:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP66:%.*]] = bitcast <8 x i64> [[TMP65]] to <16 x i32>
-// CHECK: [[SHUFFLE9_I:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP66]], <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP67:%.*]] = bitcast <16 x i32> [[SHUFFLE9_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP62]], <8 x i64>* [[__A_ADDR_I12_I]], align 64
-// CHECK: store <8 x i64> [[TMP67]], <8 x i64>* [[__B_ADDR_I13_I]], align 64
-// CHECK: [[TMP68:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I12_I]], align 64
-// CHECK: [[TMP69:%.*]] = bitcast <8 x i64> [[TMP68]] to <16 x i32>
-// CHECK: [[TMP70:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I13_I]], align 64
-// CHECK: [[TMP71:%.*]] = bitcast <8 x i64> [[TMP70]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP72:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP73:%.*]] = bitcast <8 x i64> [[TMP72]] to <16 x i32>
-// CHECK: [[TMP74:%.*]] = icmp ult <16 x i32> [[TMP69]], [[TMP71]]
-// CHECK: [[TMP75:%.*]] = select <16 x i1> [[TMP74]], <16 x i32> [[TMP69]], <16 x i32> [[TMP71]]
-// CHECK: [[TMP76:%.*]] = bitcast <16 x i32> [[TMP75]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP76]], <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP77:%.*]] = load <8 x i64>, <8 x i64>* [[A_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP77]], i32 0
-// CHECK: [[CONV_I:%.*]] = trunc i64 [[VECEXT_I]] to i32
-// CHECK: ret i32 [[CONV_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__V1_ADDR_I12_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I13_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V1_ADDR_I10_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I11_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V1_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T7_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T8_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T9_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T10_I:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP0]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT_I:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <4 x i64> [[EXTRACT_I]], <4 x i64>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT2_I:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: store <4 x i64> [[EXTRACT2_I]], <4 x i64>* [[__T2_I]], align 32
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[__T2_I]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* [[__A_ADDR_I_I]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[__B_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* [[__A_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP5]] to <8 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, <4 x i64>* [[__B_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP7]] to <8 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = icmp ult <8 x i32> [[TMP6]], [[TMP8]]
+// CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i32> [[TMP6]], <8 x i32> [[TMP8]]
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i32> [[TMP10]] to <4 x i64>
+// CHECK-NEXT: store <4 x i64> [[TMP11]], <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT4_I:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: store <2 x i64> [[EXTRACT4_I]], <2 x i64>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT5_I:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: store <2 x i64> [[EXTRACT5_I]], <2 x i64>* [[__T5_I]], align 16
+// CHECK-NEXT: [[TMP14:%.*]] = load <2 x i64>, <2 x i64>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP15:%.*]] = load <2 x i64>, <2 x i64>* [[__T5_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP14]], <2 x i64>* [[__V1_ADDR_I12_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP15]], <2 x i64>* [[__V2_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP16:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I12_I]], align 16
+// CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i64> [[TMP16]] to <4 x i32>
+// CHECK-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i64> [[TMP18]] to <4 x i32>
+// CHECK-NEXT: [[TMP20:%.*]] = icmp ult <4 x i32> [[TMP17]], [[TMP19]]
+// CHECK-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP20]], <4 x i32> [[TMP17]], <4 x i32> [[TMP19]]
+// CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[TMP21]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP22]], <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP23:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP24:%.*]] = bitcast <2 x i64> [[TMP23]] to <4 x i32>
+// CHECK-NEXT: [[TMP25:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP26:%.*]] = bitcast <2 x i64> [[TMP25]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP24]], <4 x i32> [[TMP26]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK-NEXT: [[TMP27:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP27]], <2 x i64>* [[__T7_I]], align 16
+// CHECK-NEXT: [[TMP28:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP29:%.*]] = load <2 x i64>, <2 x i64>* [[__T7_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP28]], <2 x i64>* [[__V1_ADDR_I10_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP29]], <2 x i64>* [[__V2_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP30:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I10_I]], align 16
+// CHECK-NEXT: [[TMP31:%.*]] = bitcast <2 x i64> [[TMP30]] to <4 x i32>
+// CHECK-NEXT: [[TMP32:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP33:%.*]] = bitcast <2 x i64> [[TMP32]] to <4 x i32>
+// CHECK-NEXT: [[TMP34:%.*]] = icmp ult <4 x i32> [[TMP31]], [[TMP33]]
+// CHECK-NEXT: [[TMP35:%.*]] = select <4 x i1> [[TMP34]], <4 x i32> [[TMP31]], <4 x i32> [[TMP33]]
+// CHECK-NEXT: [[TMP36:%.*]] = bitcast <4 x i32> [[TMP35]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP36]], <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP37:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP38:%.*]] = bitcast <2 x i64> [[TMP37]] to <4 x i32>
+// CHECK-NEXT: [[TMP39:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP40:%.*]] = bitcast <2 x i64> [[TMP39]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE8_I:%.*]] = shufflevector <4 x i32> [[TMP38]], <4 x i32> [[TMP40]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: [[TMP41:%.*]] = bitcast <4 x i32> [[SHUFFLE8_I]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP41]], <2 x i64>* [[__T9_I]], align 16
+// CHECK-NEXT: [[TMP42:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP43:%.*]] = load <2 x i64>, <2 x i64>* [[__T9_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP42]], <2 x i64>* [[__V1_ADDR_I_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP43]], <2 x i64>* [[__V2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP44:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP45:%.*]] = bitcast <2 x i64> [[TMP44]] to <4 x i32>
+// CHECK-NEXT: [[TMP46:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP47:%.*]] = bitcast <2 x i64> [[TMP46]] to <4 x i32>
+// CHECK-NEXT: [[TMP48:%.*]] = icmp ult <4 x i32> [[TMP45]], [[TMP47]]
+// CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[TMP45]], <4 x i32> [[TMP47]]
+// CHECK-NEXT: [[TMP50:%.*]] = bitcast <4 x i32> [[TMP49]] to <2 x i64>
+// CHECK-NEXT: store <4 x i32> [[TMP49]], <4 x i32>* [[__T10_I]], align 16
+// CHECK-NEXT: [[TMP51:%.*]] = load <4 x i32>, <4 x i32>* [[__T10_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP51]], i32 0
+// CHECK-NEXT: ret i32 [[VECEXT_I]]
unsigned int test_mm512_reduce_min_epu32(__m512i __W){
return _mm512_reduce_min_epu32(__W);
}
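// Illustrative sketch, assuming the same sixteen-lane view but with unsigned
// comparison (matching the icmp ult/select pairs checked above for
// _mm512_reduce_min_epu32):
//
//   static unsigned int ref_reduce_min_epu32(const unsigned int w[16]) {
//     unsigned int r = w[0];
//     for (int i = 1; i < 16; ++i)
//       if (w[i] < r) r = w[i];               // unsigned compare, as icmp ult
//     return r;
//   }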
// CHECK-LABEL: define float @test_mm512_reduce_min_ps(<16 x float> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I17_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__A_ADDR_I18_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__B_ADDR_I19_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I14_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__A_ADDR_I15_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__B_ADDR_I16_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I11_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__A_ADDR_I12_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__B_ADDR_I13_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[A_ADDR_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__W_ADDR:%.*]] = alloca <16 x float>, align 64
-// CHECK: store <16 x float> %__W, <16 x float>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load <16 x float>, <16 x float>* [[__W_ADDR]], align 64
-// CHECK: store <16 x float> [[TMP0]], <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP3:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP4:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> [[TMP4]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <16 x float> [[SHUFFLE_I]], <16 x float>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <16 x float> [[SHUFFLE1_I]], <16 x float>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP5:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP6:%.*]] = load <16 x float>, <16 x float>* [[__B_ADDR_I_I]], align 64
-// CHECK: store <16 x float> zeroinitializer, <16 x float>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> [[TMP5]], <16 x float> [[TMP6]], <16 x float> [[TMP7]], i16 -1, i32 4) #2
-// CHECK: store <16 x float> [[TMP8]], <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP9:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP10:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[SHUFFLE2_I:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP11:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP12:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> [[TMP12]], <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <16 x float> [[SHUFFLE2_I]], <16 x float>* [[__A_ADDR_I18_I]], align 64
-// CHECK: store <16 x float> [[SHUFFLE3_I]], <16 x float>* [[__B_ADDR_I19_I]], align 64
-// CHECK: [[TMP13:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I18_I]], align 64
-// CHECK: [[TMP14:%.*]] = load <16 x float>, <16 x float>* [[__B_ADDR_I19_I]], align 64
-// CHECK: store <16 x float> zeroinitializer, <16 x float>* [[_COMPOUNDLITERAL_I_I17_I]], align 64
-// CHECK: [[TMP15:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I17_I]], align 64
-// CHECK: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> [[TMP13]], <16 x float> [[TMP14]], <16 x float> [[TMP15]], i16 -1, i32 4) #2
-// CHECK: store <16 x float> [[TMP16]], <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP17:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP18:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[SHUFFLE5_I:%.*]] = shufflevector <16 x float> [[TMP17]], <16 x float> [[TMP18]], <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP19:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <16 x float> [[SHUFFLE5_I]], <16 x float>* [[__A_ADDR_I15_I]], align 64
-// CHECK: store <16 x float> [[SHUFFLE6_I]], <16 x float>* [[__B_ADDR_I16_I]], align 64
-// CHECK: [[TMP21:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I15_I]], align 64
-// CHECK: [[TMP22:%.*]] = load <16 x float>, <16 x float>* [[__B_ADDR_I16_I]], align 64
-// CHECK: store <16 x float> zeroinitializer, <16 x float>* [[_COMPOUNDLITERAL_I_I14_I]], align 64
-// CHECK: [[TMP23:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I14_I]], align 64
-// CHECK: [[TMP24:%.*]] = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> [[TMP21]], <16 x float> [[TMP22]], <16 x float> [[TMP23]], i16 -1, i32 4) #2
-// CHECK: store <16 x float> [[TMP24]], <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP25:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP26:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[SHUFFLE8_I:%.*]] = shufflevector <16 x float> [[TMP25]], <16 x float> [[TMP26]], <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP27:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP28:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[SHUFFLE9_I:%.*]] = shufflevector <16 x float> [[TMP27]], <16 x float> [[TMP28]], <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <16 x float> [[SHUFFLE8_I]], <16 x float>* [[__A_ADDR_I12_I]], align 64
-// CHECK: store <16 x float> [[SHUFFLE9_I]], <16 x float>* [[__B_ADDR_I13_I]], align 64
-// CHECK: [[TMP29:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I12_I]], align 64
-// CHECK: [[TMP30:%.*]] = load <16 x float>, <16 x float>* [[__B_ADDR_I13_I]], align 64
-// CHECK: store <16 x float> zeroinitializer, <16 x float>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP31:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I11_I]], align 64
-// CHECK: [[TMP32:%.*]] = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> [[TMP29]], <16 x float> [[TMP30]], <16 x float> [[TMP31]], i16 -1, i32 4) #2
-// CHECK: store <16 x float> [[TMP32]], <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[TMP33:%.*]] = load <16 x float>, <16 x float>* [[A_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <16 x float> [[TMP33]], i32 0
-// CHECK: ret float [[VECEXT_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__A_ADDR_I14_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__B_ADDR_I15_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__A_ADDR_I12_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_ADDR_I13_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__A_ADDR_I10_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_ADDR_I11_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T7_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T8_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T9_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T10_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT: store <16 x float> [[__W:%.*]], <16 x float>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, <16 x float>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store <16 x float> [[TMP0]], <16 x float>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x float> [[TMP1]] to <8 x double>
+// CHECK-NEXT: [[EXTRACT_I:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x double> [[EXTRACT_I]] to <8 x float>
+// CHECK-NEXT: store <8 x float> [[TMP3]], <8 x float>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP4:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x float> [[TMP4]] to <8 x double>
+// CHECK-NEXT: [[EXTRACT2_I:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x double> [[EXTRACT2_I]] to <8 x float>
+// CHECK-NEXT: store <8 x float> [[TMP6]], <8 x float>* [[__T2_I]], align 32
+// CHECK-NEXT: [[TMP7:%.*]] = load <8 x float>, <8 x float>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP8:%.*]] = load <8 x float>, <8 x float>* [[__T2_I]], align 32
+// CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[__A_ADDR_I14_I]], align 32
+// CHECK-NEXT: store <8 x float> [[TMP8]], <8 x float>* [[__B_ADDR_I15_I]], align 32
+// CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, <8 x float>* [[__A_ADDR_I14_I]], align 32
+// CHECK-NEXT: [[TMP10:%.*]] = load <8 x float>, <8 x float>* [[__B_ADDR_I15_I]], align 32
+// CHECK-NEXT: [[TMP11:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[TMP9]], <8 x float> [[TMP10]]) #2
+// CHECK-NEXT: store <8 x float> [[TMP11]], <8 x float>* [[__T3_I]], align 32
+// CHECK-NEXT: [[TMP12:%.*]] = load <8 x float>, <8 x float>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT4_I:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <4 x float> [[EXTRACT4_I]], <4 x float>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP13:%.*]] = load <8 x float>, <8 x float>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT5_I:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: store <4 x float> [[EXTRACT5_I]], <4 x float>* [[__T5_I]], align 16
+// CHECK-NEXT: [[TMP14:%.*]] = load <4 x float>, <4 x float>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP15:%.*]] = load <4 x float>, <4 x float>* [[__T5_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP14]], <4 x float>* [[__A_ADDR_I12_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP15]], <4 x float>* [[__B_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP16:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I12_I]], align 16
+// CHECK-NEXT: [[TMP17:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP18:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[TMP16]], <4 x float> [[TMP17]]) #2
+// CHECK-NEXT: store <4 x float> [[TMP18]], <4 x float>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP19:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP20:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK-NEXT: store <4 x float> [[SHUFFLE_I]], <4 x float>* [[__T7_I]], align 16
+// CHECK-NEXT: [[TMP21:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP22:%.*]] = load <4 x float>, <4 x float>* [[__T7_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP21]], <4 x float>* [[__A_ADDR_I10_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP22]], <4 x float>* [[__B_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP23:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I10_I]], align 16
+// CHECK-NEXT: [[TMP24:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP25:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[TMP23]], <4 x float> [[TMP24]]) #2
+// CHECK-NEXT: store <4 x float> [[TMP25]], <4 x float>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP26:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP27:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
+// CHECK-NEXT: [[SHUFFLE8_I:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> [[TMP27]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: store <4 x float> [[SHUFFLE8_I]], <4 x float>* [[__T9_I]], align 16
+// CHECK-NEXT: [[TMP28:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP29:%.*]] = load <4 x float>, <4 x float>* [[__T9_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP28]], <4 x float>* [[__A_ADDR_I_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP29]], <4 x float>* [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP30:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP31:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP32:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[TMP30]], <4 x float> [[TMP31]]) #2
+// CHECK-NEXT: store <4 x float> [[TMP32]], <4 x float>* [[__T10_I]], align 16
+// CHECK-NEXT: [[TMP33:%.*]] = load <4 x float>, <4 x float>* [[__T10_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x float> [[TMP33]], i32 0
+// CHECK-NEXT: ret float [[VECEXT_I]]
float test_mm512_reduce_min_ps(__m512 __W){
return _mm512_reduce_min_ps(__W);
}
// CHECK-LABEL: define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I18_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I19_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I20_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I15_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I16_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I17_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I12_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I13_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I14_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__S_ADDR_I_I:%.*]] = alloca i32, align 4
-// CHECK: [[_COMPOUNDLITERAL_I_I:%.*]] = alloca <16 x i32>, align 64
-// CHECK: [[__M_ADDR_I:%.*]] = alloca i16, align 2
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__M_ADDR:%.*]] = alloca i16, align 2
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK: store i16 %__M, i16* [[__M_ADDR]], align 2
-// CHECK: store <8 x i64> %__W, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
-// CHECK: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
-// CHECK: store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
-// CHECK: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
-// CHECK: store i32 -2147483648, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[TMP5:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT_I_I:%.*]] = insertelement <16 x i32> undef, i32 [[TMP5]], i32 0
-// CHECK: [[TMP6:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <16 x i32> [[VECINIT_I_I]], i32 [[TMP6]], i32 1
-// CHECK: [[TMP7:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <16 x i32> [[VECINIT1_I_I]], i32 [[TMP7]], i32 2
-// CHECK: [[TMP8:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <16 x i32> [[VECINIT2_I_I]], i32 [[TMP8]], i32 3
-// CHECK: [[TMP9:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT4_I_I:%.*]] = insertelement <16 x i32> [[VECINIT3_I_I]], i32 [[TMP9]], i32 4
-// CHECK: [[TMP10:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT5_I_I:%.*]] = insertelement <16 x i32> [[VECINIT4_I_I]], i32 [[TMP10]], i32 5
-// CHECK: [[TMP11:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT6_I_I:%.*]] = insertelement <16 x i32> [[VECINIT5_I_I]], i32 [[TMP11]], i32 6
-// CHECK: [[TMP12:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT7_I_I:%.*]] = insertelement <16 x i32> [[VECINIT6_I_I]], i32 [[TMP12]], i32 7
-// CHECK: [[TMP13:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT8_I_I:%.*]] = insertelement <16 x i32> [[VECINIT7_I_I]], i32 [[TMP13]], i32 8
-// CHECK: [[TMP14:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT9_I_I:%.*]] = insertelement <16 x i32> [[VECINIT8_I_I]], i32 [[TMP14]], i32 9
-// CHECK: [[TMP15:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT10_I_I:%.*]] = insertelement <16 x i32> [[VECINIT9_I_I]], i32 [[TMP15]], i32 10
-// CHECK: [[TMP16:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT11_I_I:%.*]] = insertelement <16 x i32> [[VECINIT10_I_I]], i32 [[TMP16]], i32 11
-// CHECK: [[TMP17:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT12_I_I:%.*]] = insertelement <16 x i32> [[VECINIT11_I_I]], i32 [[TMP17]], i32 12
-// CHECK: [[TMP18:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT13_I_I:%.*]] = insertelement <16 x i32> [[VECINIT12_I_I]], i32 [[TMP18]], i32 13
-// CHECK: [[TMP19:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT14_I_I:%.*]] = insertelement <16 x i32> [[VECINIT13_I_I]], i32 [[TMP19]], i32 14
-// CHECK: [[TMP20:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT15_I_I:%.*]] = insertelement <16 x i32> [[VECINIT14_I_I]], i32 [[TMP20]], i32 15
-// CHECK: store <16 x i32> [[VECINIT15_I_I]], <16 x i32>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP21:%.*]] = load <16 x i32>, <16 x i32>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP22:%.*]] = bitcast <16 x i32> [[TMP21]] to <8 x i64>
-// CHECK: [[TMP23:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-// CHECK: [[TMP24:%.*]] = select <16 x i1> [[TMP23]], <16 x i32> [[TMP4]], <16 x i32> [[TMP21]]
-// CHECK: [[TMP25:%.*]] = bitcast <16 x i32> [[TMP24]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP25]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP26:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP27:%.*]] = bitcast <8 x i64> [[TMP26]] to <16 x i32>
-// CHECK: [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP29:%.*]] = bitcast <8 x i64> [[TMP28]] to <16 x i32>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i32> [[TMP27]], <16 x i32> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP30:%.*]] = bitcast <16 x i32> [[SHUFFLE_I]] to <8 x i64>
-// CHECK: [[TMP31:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP32:%.*]] = bitcast <8 x i64> [[TMP31]] to <16 x i32>
-// CHECK: [[TMP33:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP34:%.*]] = bitcast <8 x i64> [[TMP33]] to <16 x i32>
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <16 x i32> [[TMP32]], <16 x i32> [[TMP34]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP35:%.*]] = bitcast <16 x i32> [[SHUFFLE1_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP30]], <8 x i64>* [[__A_ADDR_I19_I]], align 64
-// CHECK: store <8 x i64> [[TMP35]], <8 x i64>* [[__B_ADDR_I20_I]], align 64
-// CHECK: [[TMP36:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I19_I]], align 64
-// CHECK: [[TMP37:%.*]] = bitcast <8 x i64> [[TMP36]] to <16 x i32>
-// CHECK: [[TMP38:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I20_I]], align 64
-// CHECK: [[TMP39:%.*]] = bitcast <8 x i64> [[TMP38]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I18_I]], align 64
-// CHECK: [[TMP40:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I18_I]], align 64
-// CHECK: [[TMP41:%.*]] = bitcast <8 x i64> [[TMP40]] to <16 x i32>
-// CHECK: [[TMP42:%.*]] = icmp sgt <16 x i32> [[TMP37]], [[TMP39]]
-// CHECK: [[TMP43:%.*]] = select <16 x i1> [[TMP42]], <16 x i32> [[TMP37]], <16 x i32> [[TMP39]]
-// CHECK: [[TMP44:%.*]] = bitcast <16 x i32> [[TMP43]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP44]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP45:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP46:%.*]] = bitcast <8 x i64> [[TMP45]] to <16 x i32>
-// CHECK: [[TMP47:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP48:%.*]] = bitcast <8 x i64> [[TMP47]] to <16 x i32>
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <16 x i32> [[TMP46]], <16 x i32> [[TMP48]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP49:%.*]] = bitcast <16 x i32> [[SHUFFLE3_I]] to <8 x i64>
-// CHECK: [[TMP50:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP51:%.*]] = bitcast <8 x i64> [[TMP50]] to <16 x i32>
-// CHECK: [[TMP52:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP53:%.*]] = bitcast <8 x i64> [[TMP52]] to <16 x i32>
-// CHECK: [[SHUFFLE4_I:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP53]], <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP54:%.*]] = bitcast <16 x i32> [[SHUFFLE4_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP49]], <8 x i64>* [[__A_ADDR_I16_I]], align 64
-// CHECK: store <8 x i64> [[TMP54]], <8 x i64>* [[__B_ADDR_I17_I]], align 64
-// CHECK: [[TMP55:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I16_I]], align 64
-// CHECK: [[TMP56:%.*]] = bitcast <8 x i64> [[TMP55]] to <16 x i32>
-// CHECK: [[TMP57:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I17_I]], align 64
-// CHECK: [[TMP58:%.*]] = bitcast <8 x i64> [[TMP57]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I15_I]], align 64
-// CHECK: [[TMP59:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I15_I]], align 64
-// CHECK: [[TMP60:%.*]] = bitcast <8 x i64> [[TMP59]] to <16 x i32>
-// CHECK: [[TMP61:%.*]] = icmp sgt <16 x i32> [[TMP56]], [[TMP58]]
-// CHECK: [[TMP62:%.*]] = select <16 x i1> [[TMP61]], <16 x i32> [[TMP56]], <16 x i32> [[TMP58]]
-// CHECK: [[TMP63:%.*]] = bitcast <16 x i32> [[TMP62]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP63]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP64:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP65:%.*]] = bitcast <8 x i64> [[TMP64]] to <16 x i32>
-// CHECK: [[TMP66:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP67:%.*]] = bitcast <8 x i64> [[TMP66]] to <16 x i32>
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP68:%.*]] = bitcast <16 x i32> [[SHUFFLE6_I]] to <8 x i64>
-// CHECK: [[TMP69:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP70:%.*]] = bitcast <8 x i64> [[TMP69]] to <16 x i32>
-// CHECK: [[TMP71:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP72:%.*]] = bitcast <8 x i64> [[TMP71]] to <16 x i32>
-// CHECK: [[SHUFFLE7_I:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP72]], <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP73:%.*]] = bitcast <16 x i32> [[SHUFFLE7_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP68]], <8 x i64>* [[__A_ADDR_I13_I]], align 64
-// CHECK: store <8 x i64> [[TMP73]], <8 x i64>* [[__B_ADDR_I14_I]], align 64
-// CHECK: [[TMP74:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I13_I]], align 64
-// CHECK: [[TMP75:%.*]] = bitcast <8 x i64> [[TMP74]] to <16 x i32>
-// CHECK: [[TMP76:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I14_I]], align 64
-// CHECK: [[TMP77:%.*]] = bitcast <8 x i64> [[TMP76]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP78:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP79:%.*]] = bitcast <8 x i64> [[TMP78]] to <16 x i32>
-// CHECK: [[TMP80:%.*]] = icmp sgt <16 x i32> [[TMP75]], [[TMP77]]
-// CHECK: [[TMP81:%.*]] = select <16 x i1> [[TMP80]], <16 x i32> [[TMP75]], <16 x i32> [[TMP77]]
-// CHECK: [[TMP82:%.*]] = bitcast <16 x i32> [[TMP81]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP82]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP83:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP84:%.*]] = bitcast <8 x i64> [[TMP83]] to <16 x i32>
-// CHECK: [[TMP85:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP86:%.*]] = bitcast <8 x i64> [[TMP85]] to <16 x i32>
-// CHECK: [[SHUFFLE9_I:%.*]] = shufflevector <16 x i32> [[TMP84]], <16 x i32> [[TMP86]], <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP87:%.*]] = bitcast <16 x i32> [[SHUFFLE9_I]] to <8 x i64>
-// CHECK: [[TMP88:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP89:%.*]] = bitcast <8 x i64> [[TMP88]] to <16 x i32>
-// CHECK: [[TMP90:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP91:%.*]] = bitcast <8 x i64> [[TMP90]] to <16 x i32>
-// CHECK: [[SHUFFLE10_I:%.*]] = shufflevector <16 x i32> [[TMP89]], <16 x i32> [[TMP91]], <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP92:%.*]] = bitcast <16 x i32> [[SHUFFLE10_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP87]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> [[TMP92]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP93:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP94:%.*]] = bitcast <8 x i64> [[TMP93]] to <16 x i32>
-// CHECK: [[TMP95:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP96:%.*]] = bitcast <8 x i64> [[TMP95]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP97:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP98:%.*]] = bitcast <8 x i64> [[TMP97]] to <16 x i32>
-// CHECK: [[TMP99:%.*]] = icmp sgt <16 x i32> [[TMP94]], [[TMP96]]
-// CHECK: [[TMP100:%.*]] = select <16 x i1> [[TMP99]], <16 x i32> [[TMP94]], <16 x i32> [[TMP96]]
-// CHECK: [[TMP101:%.*]] = bitcast <16 x i32> [[TMP100]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP101]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP102:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP102]], i32 0
-// CHECK: [[CONV_I:%.*]] = trunc i64 [[VECEXT_I]] to i32
-// CHECK: ret i32 [[CONV_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__W_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__U_ADDR_I_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__A2_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__V1_ADDR_I14_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I15_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V1_ADDR_I12_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I13_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V1_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__S_ADDR_I_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x i32>, align 64
+// CHECK-NEXT: [[__M_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T7_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T8_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T9_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T10_I:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT: [[__M_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: store i16 [[__M:%.*]], i16* [[__M_ADDR]], align 2
+// CHECK-NEXT: store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
+// CHECK-NEXT: store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store i32 -2147483648, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <16 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <16 x i32> [[VECINIT_I_I]], i32 [[TMP3]], i32 1
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <16 x i32> [[VECINIT1_I_I]], i32 [[TMP4]], i32 2
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <16 x i32> [[VECINIT2_I_I]], i32 [[TMP5]], i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT4_I_I:%.*]] = insertelement <16 x i32> [[VECINIT3_I_I]], i32 [[TMP6]], i32 4
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT5_I_I:%.*]] = insertelement <16 x i32> [[VECINIT4_I_I]], i32 [[TMP7]], i32 5
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT6_I_I:%.*]] = insertelement <16 x i32> [[VECINIT5_I_I]], i32 [[TMP8]], i32 6
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT7_I_I:%.*]] = insertelement <16 x i32> [[VECINIT6_I_I]], i32 [[TMP9]], i32 7
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT8_I_I:%.*]] = insertelement <16 x i32> [[VECINIT7_I_I]], i32 [[TMP10]], i32 8
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT9_I_I:%.*]] = insertelement <16 x i32> [[VECINIT8_I_I]], i32 [[TMP11]], i32 9
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT10_I_I:%.*]] = insertelement <16 x i32> [[VECINIT9_I_I]], i32 [[TMP12]], i32 10
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT11_I_I:%.*]] = insertelement <16 x i32> [[VECINIT10_I_I]], i32 [[TMP13]], i32 11
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT12_I_I:%.*]] = insertelement <16 x i32> [[VECINIT11_I_I]], i32 [[TMP14]], i32 12
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT13_I_I:%.*]] = insertelement <16 x i32> [[VECINIT12_I_I]], i32 [[TMP15]], i32 13
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT14_I_I:%.*]] = insertelement <16 x i32> [[VECINIT13_I_I]], i32 [[TMP16]], i32 14
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT15_I_I:%.*]] = insertelement <16 x i32> [[VECINIT14_I_I]], i32 [[TMP17]], i32 15
+// CHECK-NEXT: store <16 x i32> [[VECINIT15_I_I]], <16 x i32>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP18:%.*]] = load <16 x i32>, <16 x i32>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i32> [[TMP18]] to <8 x i64>
+// CHECK-NEXT: [[TMP20:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
+// CHECK-NEXT: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP19]], <8 x i64>* [[__W_ADDR_I_I]], align 64
+// CHECK-NEXT: store i16 [[TMP20]], i16* [[__U_ADDR_I_I]], align 2
+// CHECK-NEXT: store <8 x i64> [[TMP21]], <8 x i64>* [[__A2_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP22:%.*]] = load i16, i16* [[__U_ADDR_I_I]], align 2
+// CHECK-NEXT: [[TMP23:%.*]] = load <8 x i64>, <8 x i64>* [[__A2_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i64> [[TMP23]] to <16 x i32>
+// CHECK-NEXT: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP26:%.*]] = bitcast <8 x i64> [[TMP25]] to <16 x i32>
+// CHECK-NEXT: [[TMP27:%.*]] = bitcast i16 [[TMP22]] to <16 x i1>
+// CHECK-NEXT: [[TMP28:%.*]] = select <16 x i1> [[TMP27]], <16 x i32> [[TMP24]], <16 x i32> [[TMP26]]
+// CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[TMP28]] to <8 x i64>
+// CHECK-NEXT: store <8 x i64> [[TMP29]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT_I:%.*]] = shufflevector <8 x i64> [[TMP30]], <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <4 x i64> [[EXTRACT_I]], <4 x i64>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP31:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT4_I:%.*]] = shufflevector <8 x i64> [[TMP31]], <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: store <4 x i64> [[EXTRACT4_I]], <4 x i64>* [[__T2_I]], align 32
+// CHECK-NEXT: [[TMP32:%.*]] = load <4 x i64>, <4 x i64>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP33:%.*]] = load <4 x i64>, <4 x i64>* [[__T2_I]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP32]], <4 x i64>* [[__A_ADDR_I_I]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP33]], <4 x i64>* [[__B_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP34:%.*]] = load <4 x i64>, <4 x i64>* [[__A_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP35:%.*]] = bitcast <4 x i64> [[TMP34]] to <8 x i32>
+// CHECK-NEXT: [[TMP36:%.*]] = load <4 x i64>, <4 x i64>* [[__B_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i64> [[TMP36]] to <8 x i32>
+// CHECK-NEXT: [[TMP38:%.*]] = icmp sgt <8 x i32> [[TMP35]], [[TMP37]]
+// CHECK-NEXT: [[TMP39:%.*]] = select <8 x i1> [[TMP38]], <8 x i32> [[TMP35]], <8 x i32> [[TMP37]]
+// CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP39]] to <4 x i64>
+// CHECK-NEXT: store <4 x i64> [[TMP40]], <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[TMP41:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT6_I:%.*]] = shufflevector <4 x i64> [[TMP41]], <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: store <2 x i64> [[EXTRACT6_I]], <2 x i64>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP42:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT7_I:%.*]] = shufflevector <4 x i64> [[TMP42]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: store <2 x i64> [[EXTRACT7_I]], <2 x i64>* [[__T5_I]], align 16
+// CHECK-NEXT: [[TMP43:%.*]] = load <2 x i64>, <2 x i64>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP44:%.*]] = load <2 x i64>, <2 x i64>* [[__T5_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP43]], <2 x i64>* [[__V1_ADDR_I14_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* [[__V2_ADDR_I15_I]], align 16
+// CHECK-NEXT: [[TMP45:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I14_I]], align 16
+// CHECK-NEXT: [[TMP46:%.*]] = bitcast <2 x i64> [[TMP45]] to <4 x i32>
+// CHECK-NEXT: [[TMP47:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I15_I]], align 16
+// CHECK-NEXT: [[TMP48:%.*]] = bitcast <2 x i64> [[TMP47]] to <4 x i32>
+// CHECK-NEXT: [[TMP49:%.*]] = icmp sgt <4 x i32> [[TMP46]], [[TMP48]]
+// CHECK-NEXT: [[TMP50:%.*]] = select <4 x i1> [[TMP49]], <4 x i32> [[TMP46]], <4 x i32> [[TMP48]]
+// CHECK-NEXT: [[TMP51:%.*]] = bitcast <4 x i32> [[TMP50]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP51]], <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP52:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP53:%.*]] = bitcast <2 x i64> [[TMP52]] to <4 x i32>
+// CHECK-NEXT: [[TMP54:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP55:%.*]] = bitcast <2 x i64> [[TMP54]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> [[TMP55]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK-NEXT: [[TMP56:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP56]], <2 x i64>* [[__T7_I]], align 16
+// CHECK-NEXT: [[TMP57:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP58:%.*]] = load <2 x i64>, <2 x i64>* [[__T7_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP57]], <2 x i64>* [[__V1_ADDR_I12_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP58]], <2 x i64>* [[__V2_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP59:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I12_I]], align 16
+// CHECK-NEXT: [[TMP60:%.*]] = bitcast <2 x i64> [[TMP59]] to <4 x i32>
+// CHECK-NEXT: [[TMP61:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP62:%.*]] = bitcast <2 x i64> [[TMP61]] to <4 x i32>
+// CHECK-NEXT: [[TMP63:%.*]] = icmp sgt <4 x i32> [[TMP60]], [[TMP62]]
+// CHECK-NEXT: [[TMP64:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP60]], <4 x i32> [[TMP62]]
+// CHECK-NEXT: [[TMP65:%.*]] = bitcast <4 x i32> [[TMP64]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP65]], <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP66:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP67:%.*]] = bitcast <2 x i64> [[TMP66]] to <4 x i32>
+// CHECK-NEXT: [[TMP68:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP69:%.*]] = bitcast <2 x i64> [[TMP68]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE10_I:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> [[TMP69]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: [[TMP70:%.*]] = bitcast <4 x i32> [[SHUFFLE10_I]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP70]], <2 x i64>* [[__T9_I]], align 16
+// CHECK-NEXT: [[TMP71:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP72:%.*]] = load <2 x i64>, <2 x i64>* [[__T9_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP71]], <2 x i64>* [[__V1_ADDR_I_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP72]], <2 x i64>* [[__V2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP73:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP74:%.*]] = bitcast <2 x i64> [[TMP73]] to <4 x i32>
+// CHECK-NEXT: [[TMP75:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP76:%.*]] = bitcast <2 x i64> [[TMP75]] to <4 x i32>
+// CHECK-NEXT: [[TMP77:%.*]] = icmp sgt <4 x i32> [[TMP74]], [[TMP76]]
+// CHECK-NEXT: [[TMP78:%.*]] = select <4 x i1> [[TMP77]], <4 x i32> [[TMP74]], <4 x i32> [[TMP76]]
+// CHECK-NEXT: [[TMP79:%.*]] = bitcast <4 x i32> [[TMP78]] to <2 x i64>
+// CHECK-NEXT: store <4 x i32> [[TMP78]], <4 x i32>* [[__T10_I]], align 16
+// CHECK-NEXT: [[TMP80:%.*]] = load <4 x i32>, <4 x i32>* [[__T10_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP80]], i32 0
+// CHECK-NEXT: ret i32 [[VECEXT_I]]
int test_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __W){
return _mm512_mask_reduce_max_epi32(__M, __W);
}
// CHECK-LABEL: define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I18_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I19_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I20_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I15_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I16_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I17_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I12_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I13_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I14_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__S_ADDR_I_I:%.*]] = alloca i32, align 4
-// CHECK: [[_COMPOUNDLITERAL_I_I:%.*]] = alloca <16 x i32>, align 64
-// CHECK: [[__M_ADDR_I:%.*]] = alloca i16, align 2
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__M_ADDR:%.*]] = alloca i16, align 2
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK: store i16 %__M, i16* [[__M_ADDR]], align 2
-// CHECK: store <8 x i64> %__W, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
-// CHECK: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
-// CHECK: store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
-// CHECK: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
-// CHECK: store i32 0, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[TMP5:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT_I_I:%.*]] = insertelement <16 x i32> undef, i32 [[TMP5]], i32 0
-// CHECK: [[TMP6:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <16 x i32> [[VECINIT_I_I]], i32 [[TMP6]], i32 1
-// CHECK: [[TMP7:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <16 x i32> [[VECINIT1_I_I]], i32 [[TMP7]], i32 2
-// CHECK: [[TMP8:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <16 x i32> [[VECINIT2_I_I]], i32 [[TMP8]], i32 3
-// CHECK: [[TMP9:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT4_I_I:%.*]] = insertelement <16 x i32> [[VECINIT3_I_I]], i32 [[TMP9]], i32 4
-// CHECK: [[TMP10:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT5_I_I:%.*]] = insertelement <16 x i32> [[VECINIT4_I_I]], i32 [[TMP10]], i32 5
-// CHECK: [[TMP11:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT6_I_I:%.*]] = insertelement <16 x i32> [[VECINIT5_I_I]], i32 [[TMP11]], i32 6
-// CHECK: [[TMP12:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT7_I_I:%.*]] = insertelement <16 x i32> [[VECINIT6_I_I]], i32 [[TMP12]], i32 7
-// CHECK: [[TMP13:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT8_I_I:%.*]] = insertelement <16 x i32> [[VECINIT7_I_I]], i32 [[TMP13]], i32 8
-// CHECK: [[TMP14:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT9_I_I:%.*]] = insertelement <16 x i32> [[VECINIT8_I_I]], i32 [[TMP14]], i32 9
-// CHECK: [[TMP15:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT10_I_I:%.*]] = insertelement <16 x i32> [[VECINIT9_I_I]], i32 [[TMP15]], i32 10
-// CHECK: [[TMP16:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT11_I_I:%.*]] = insertelement <16 x i32> [[VECINIT10_I_I]], i32 [[TMP16]], i32 11
-// CHECK: [[TMP17:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT12_I_I:%.*]] = insertelement <16 x i32> [[VECINIT11_I_I]], i32 [[TMP17]], i32 12
-// CHECK: [[TMP18:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT13_I_I:%.*]] = insertelement <16 x i32> [[VECINIT12_I_I]], i32 [[TMP18]], i32 13
-// CHECK: [[TMP19:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT14_I_I:%.*]] = insertelement <16 x i32> [[VECINIT13_I_I]], i32 [[TMP19]], i32 14
-// CHECK: [[TMP20:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT15_I_I:%.*]] = insertelement <16 x i32> [[VECINIT14_I_I]], i32 [[TMP20]], i32 15
-// CHECK: store <16 x i32> [[VECINIT15_I_I]], <16 x i32>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP21:%.*]] = load <16 x i32>, <16 x i32>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP22:%.*]] = bitcast <16 x i32> [[TMP21]] to <8 x i64>
-// CHECK: [[TMP23:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-// CHECK: [[TMP24:%.*]] = select <16 x i1> [[TMP23]], <16 x i32> [[TMP4]], <16 x i32> [[TMP21]]
-// CHECK: [[TMP25:%.*]] = bitcast <16 x i32> [[TMP24]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP25]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP26:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP27:%.*]] = bitcast <8 x i64> [[TMP26]] to <16 x i32>
-// CHECK: [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP29:%.*]] = bitcast <8 x i64> [[TMP28]] to <16 x i32>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i32> [[TMP27]], <16 x i32> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP30:%.*]] = bitcast <16 x i32> [[SHUFFLE_I]] to <8 x i64>
-// CHECK: [[TMP31:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP32:%.*]] = bitcast <8 x i64> [[TMP31]] to <16 x i32>
-// CHECK: [[TMP33:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP34:%.*]] = bitcast <8 x i64> [[TMP33]] to <16 x i32>
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <16 x i32> [[TMP32]], <16 x i32> [[TMP34]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP35:%.*]] = bitcast <16 x i32> [[SHUFFLE1_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP30]], <8 x i64>* [[__A_ADDR_I19_I]], align 64
-// CHECK: store <8 x i64> [[TMP35]], <8 x i64>* [[__B_ADDR_I20_I]], align 64
-// CHECK: [[TMP36:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I19_I]], align 64
-// CHECK: [[TMP37:%.*]] = bitcast <8 x i64> [[TMP36]] to <16 x i32>
-// CHECK: [[TMP38:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I20_I]], align 64
-// CHECK: [[TMP39:%.*]] = bitcast <8 x i64> [[TMP38]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I18_I]], align 64
-// CHECK: [[TMP40:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I18_I]], align 64
-// CHECK: [[TMP41:%.*]] = bitcast <8 x i64> [[TMP40]] to <16 x i32>
-// CHECK: [[TMP42:%.*]] = icmp ugt <16 x i32> [[TMP37]], [[TMP39]]
-// CHECK: [[TMP43:%.*]] = select <16 x i1> [[TMP42]], <16 x i32> [[TMP37]], <16 x i32> [[TMP39]]
-// CHECK: [[TMP44:%.*]] = bitcast <16 x i32> [[TMP43]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP44]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP45:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP46:%.*]] = bitcast <8 x i64> [[TMP45]] to <16 x i32>
-// CHECK: [[TMP47:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP48:%.*]] = bitcast <8 x i64> [[TMP47]] to <16 x i32>
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <16 x i32> [[TMP46]], <16 x i32> [[TMP48]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP49:%.*]] = bitcast <16 x i32> [[SHUFFLE3_I]] to <8 x i64>
-// CHECK: [[TMP50:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP51:%.*]] = bitcast <8 x i64> [[TMP50]] to <16 x i32>
-// CHECK: [[TMP52:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP53:%.*]] = bitcast <8 x i64> [[TMP52]] to <16 x i32>
-// CHECK: [[SHUFFLE4_I:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP53]], <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP54:%.*]] = bitcast <16 x i32> [[SHUFFLE4_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP49]], <8 x i64>* [[__A_ADDR_I16_I]], align 64
-// CHECK: store <8 x i64> [[TMP54]], <8 x i64>* [[__B_ADDR_I17_I]], align 64
-// CHECK: [[TMP55:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I16_I]], align 64
-// CHECK: [[TMP56:%.*]] = bitcast <8 x i64> [[TMP55]] to <16 x i32>
-// CHECK: [[TMP57:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I17_I]], align 64
-// CHECK: [[TMP58:%.*]] = bitcast <8 x i64> [[TMP57]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I15_I]], align 64
-// CHECK: [[TMP59:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I15_I]], align 64
-// CHECK: [[TMP60:%.*]] = bitcast <8 x i64> [[TMP59]] to <16 x i32>
-// CHECK: [[TMP61:%.*]] = icmp ugt <16 x i32> [[TMP56]], [[TMP58]]
-// CHECK: [[TMP62:%.*]] = select <16 x i1> [[TMP61]], <16 x i32> [[TMP56]], <16 x i32> [[TMP58]]
-// CHECK: [[TMP63:%.*]] = bitcast <16 x i32> [[TMP62]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP63]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP64:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP65:%.*]] = bitcast <8 x i64> [[TMP64]] to <16 x i32>
-// CHECK: [[TMP66:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP67:%.*]] = bitcast <8 x i64> [[TMP66]] to <16 x i32>
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP68:%.*]] = bitcast <16 x i32> [[SHUFFLE6_I]] to <8 x i64>
-// CHECK: [[TMP69:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP70:%.*]] = bitcast <8 x i64> [[TMP69]] to <16 x i32>
-// CHECK: [[TMP71:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP72:%.*]] = bitcast <8 x i64> [[TMP71]] to <16 x i32>
-// CHECK: [[SHUFFLE7_I:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP72]], <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP73:%.*]] = bitcast <16 x i32> [[SHUFFLE7_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP68]], <8 x i64>* [[__A_ADDR_I13_I]], align 64
-// CHECK: store <8 x i64> [[TMP73]], <8 x i64>* [[__B_ADDR_I14_I]], align 64
-// CHECK: [[TMP74:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I13_I]], align 64
-// CHECK: [[TMP75:%.*]] = bitcast <8 x i64> [[TMP74]] to <16 x i32>
-// CHECK: [[TMP76:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I14_I]], align 64
-// CHECK: [[TMP77:%.*]] = bitcast <8 x i64> [[TMP76]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP78:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP79:%.*]] = bitcast <8 x i64> [[TMP78]] to <16 x i32>
-// CHECK: [[TMP80:%.*]] = icmp ugt <16 x i32> [[TMP75]], [[TMP77]]
-// CHECK: [[TMP81:%.*]] = select <16 x i1> [[TMP80]], <16 x i32> [[TMP75]], <16 x i32> [[TMP77]]
-// CHECK: [[TMP82:%.*]] = bitcast <16 x i32> [[TMP81]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP82]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP83:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP84:%.*]] = bitcast <8 x i64> [[TMP83]] to <16 x i32>
-// CHECK: [[TMP85:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP86:%.*]] = bitcast <8 x i64> [[TMP85]] to <16 x i32>
-// CHECK: [[SHUFFLE9_I:%.*]] = shufflevector <16 x i32> [[TMP84]], <16 x i32> [[TMP86]], <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP87:%.*]] = bitcast <16 x i32> [[SHUFFLE9_I]] to <8 x i64>
-// CHECK: [[TMP88:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP89:%.*]] = bitcast <8 x i64> [[TMP88]] to <16 x i32>
-// CHECK: [[TMP90:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP91:%.*]] = bitcast <8 x i64> [[TMP90]] to <16 x i32>
-// CHECK: [[SHUFFLE10_I:%.*]] = shufflevector <16 x i32> [[TMP89]], <16 x i32> [[TMP91]], <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP92:%.*]] = bitcast <16 x i32> [[SHUFFLE10_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP87]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> [[TMP92]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP93:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP94:%.*]] = bitcast <8 x i64> [[TMP93]] to <16 x i32>
-// CHECK: [[TMP95:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP96:%.*]] = bitcast <8 x i64> [[TMP95]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP97:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP98:%.*]] = bitcast <8 x i64> [[TMP97]] to <16 x i32>
-// CHECK: [[TMP99:%.*]] = icmp ugt <16 x i32> [[TMP94]], [[TMP96]]
-// CHECK: [[TMP100:%.*]] = select <16 x i1> [[TMP99]], <16 x i32> [[TMP94]], <16 x i32> [[TMP96]]
-// CHECK: [[TMP101:%.*]] = bitcast <16 x i32> [[TMP100]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP101]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP102:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP102]], i32 0
-// CHECK: [[CONV_I:%.*]] = trunc i64 [[VECEXT_I]] to i32
-// CHECK: ret i32 [[CONV_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__A2_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__V1_ADDR_I13_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I14_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V1_ADDR_I11_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I12_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V1_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__U_ADDR_I_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__M_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T7_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T8_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T9_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T10_I:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT: [[__M_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: store i16 [[__M:%.*]], i16* [[__M_ADDR]], align 2
+// CHECK-NEXT: store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
+// CHECK-NEXT: store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store i16 [[TMP2]], i16* [[__U_ADDR_I_I]], align 2
+// CHECK-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[__U_ADDR_I_I]], align 2
+// CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP5]] to <16 x i32>
+// CHECK-NEXT: store <8 x i64> zeroinitializer, <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I_I]], align 64
+// CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, <8 x i64>* [[DOTCOMPOUNDLITERAL_I_I_I]], align 64
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP7]] to <16 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
+// CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP6]], <16 x i32> [[TMP8]]
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64>
+// CHECK-NEXT: store <8 x i64> [[TMP11]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT_I:%.*]] = shufflevector <8 x i64> [[TMP12]], <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <4 x i64> [[EXTRACT_I]], <4 x i64>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP13:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT3_I:%.*]] = shufflevector <8 x i64> [[TMP13]], <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: store <4 x i64> [[EXTRACT3_I]], <4 x i64>* [[__T2_I]], align 32
+// CHECK-NEXT: [[TMP14:%.*]] = load <4 x i64>, <4 x i64>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP15:%.*]] = load <4 x i64>, <4 x i64>* [[__T2_I]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP14]], <4 x i64>* [[__A2_ADDR_I_I]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP15]], <4 x i64>* [[__B_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP16:%.*]] = load <4 x i64>, <4 x i64>* [[__A2_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i64> [[TMP16]] to <8 x i32>
+// CHECK-NEXT: [[TMP18:%.*]] = load <4 x i64>, <4 x i64>* [[__B_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i64> [[TMP18]] to <8 x i32>
+// CHECK-NEXT: [[TMP20:%.*]] = icmp ugt <8 x i32> [[TMP17]], [[TMP19]]
+// CHECK-NEXT: [[TMP21:%.*]] = select <8 x i1> [[TMP20]], <8 x i32> [[TMP17]], <8 x i32> [[TMP19]]
+// CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[TMP21]] to <4 x i64>
+// CHECK-NEXT: store <4 x i64> [[TMP22]], <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[TMP23:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT5_I:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: store <2 x i64> [[EXTRACT5_I]], <2 x i64>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP24:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT6_I:%.*]] = shufflevector <4 x i64> [[TMP24]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: store <2 x i64> [[EXTRACT6_I]], <2 x i64>* [[__T5_I]], align 16
+// CHECK-NEXT: [[TMP25:%.*]] = load <2 x i64>, <2 x i64>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP26:%.*]] = load <2 x i64>, <2 x i64>* [[__T5_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP25]], <2 x i64>* [[__V1_ADDR_I13_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* [[__V2_ADDR_I14_I]], align 16
+// CHECK-NEXT: [[TMP27:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP28:%.*]] = bitcast <2 x i64> [[TMP27]] to <4 x i32>
+// CHECK-NEXT: [[TMP29:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I14_I]], align 16
+// CHECK-NEXT: [[TMP30:%.*]] = bitcast <2 x i64> [[TMP29]] to <4 x i32>
+// CHECK-NEXT: [[TMP31:%.*]] = icmp ugt <4 x i32> [[TMP28]], [[TMP30]]
+// CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP31]], <4 x i32> [[TMP28]], <4 x i32> [[TMP30]]
+// CHECK-NEXT: [[TMP33:%.*]] = bitcast <4 x i32> [[TMP32]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP33]], <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP34:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP35:%.*]] = bitcast <2 x i64> [[TMP34]] to <4 x i32>
+// CHECK-NEXT: [[TMP36:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP37:%.*]] = bitcast <2 x i64> [[TMP36]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP35]], <4 x i32> [[TMP37]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP38]], <2 x i64>* [[__T7_I]], align 16
+// CHECK-NEXT: [[TMP39:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP40:%.*]] = load <2 x i64>, <2 x i64>* [[__T7_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP39]], <2 x i64>* [[__V1_ADDR_I11_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP40]], <2 x i64>* [[__V2_ADDR_I12_I]], align 16
+// CHECK-NEXT: [[TMP41:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I11_I]], align 16
+// CHECK-NEXT: [[TMP42:%.*]] = bitcast <2 x i64> [[TMP41]] to <4 x i32>
+// CHECK-NEXT: [[TMP43:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I12_I]], align 16
+// CHECK-NEXT: [[TMP44:%.*]] = bitcast <2 x i64> [[TMP43]] to <4 x i32>
+// CHECK-NEXT: [[TMP45:%.*]] = icmp ugt <4 x i32> [[TMP42]], [[TMP44]]
+// CHECK-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP45]], <4 x i32> [[TMP42]], <4 x i32> [[TMP44]]
+// CHECK-NEXT: [[TMP47:%.*]] = bitcast <4 x i32> [[TMP46]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP47]], <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP48:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP49:%.*]] = bitcast <2 x i64> [[TMP48]] to <4 x i32>
+// CHECK-NEXT: [[TMP50:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP51:%.*]] = bitcast <2 x i64> [[TMP50]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE9_I:%.*]] = shufflevector <4 x i32> [[TMP49]], <4 x i32> [[TMP51]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: [[TMP52:%.*]] = bitcast <4 x i32> [[SHUFFLE9_I]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP52]], <2 x i64>* [[__T9_I]], align 16
+// CHECK-NEXT: [[TMP53:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP54:%.*]] = load <2 x i64>, <2 x i64>* [[__T9_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP53]], <2 x i64>* [[__V1_ADDR_I_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP54]], <2 x i64>* [[__V2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP55:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP56:%.*]] = bitcast <2 x i64> [[TMP55]] to <4 x i32>
+// CHECK-NEXT: [[TMP57:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP58:%.*]] = bitcast <2 x i64> [[TMP57]] to <4 x i32>
+// CHECK-NEXT: [[TMP59:%.*]] = icmp ugt <4 x i32> [[TMP56]], [[TMP58]]
+// CHECK-NEXT: [[TMP60:%.*]] = select <4 x i1> [[TMP59]], <4 x i32> [[TMP56]], <4 x i32> [[TMP58]]
+// CHECK-NEXT: [[TMP61:%.*]] = bitcast <4 x i32> [[TMP60]] to <2 x i64>
+// CHECK-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[__T10_I]], align 16
+// CHECK-NEXT: [[TMP62:%.*]] = load <4 x i32>, <4 x i32>* [[__T10_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP62]], i32 0
+// CHECK-NEXT: ret i32 [[VECEXT_I]]
unsigned int test_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __W){
return _mm512_mask_reduce_max_epu32(__M, __W);
}
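The checks above pin down the tree lowering of this reduction: masked-off lanes are first replaced with 0 (the identity for unsigned max, produced by the zeroinitializer/select pair near the top), and the 512-bit value is then repeatedly halved and recombined with icmp ugt + select until element 0 of the final <4 x i32> is returned. As a rough guide to the semantics being verified, the scalar model below is illustrative only; the helper name and loop are not part of the test or of the intrinsics header.

static unsigned int reduce_max_epu32_model(unsigned short mask,
                                           const unsigned int v[16]) {
  /* Masked-off lanes contribute the identity value 0 for unsigned max. */
  unsigned int best = 0;
  for (int i = 0; i < 16; ++i)
    if (((mask >> i) & 1) && v[i] > best)
      best = v[i];
  return best;
}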
// CHECK-LABEL: define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I18_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__A_ADDR_I19_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__B_ADDR_I20_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I15_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__A_ADDR_I16_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__B_ADDR_I17_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I12_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__A_ADDR_I13_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__B_ADDR_I14_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__W_ADDR_I_I:%.*]] = alloca float, align 4
-// CHECK: [[_COMPOUNDLITERAL_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__M_ADDR_I:%.*]] = alloca i16, align 2
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__M_ADDR:%.*]] = alloca i16, align 2
-// CHECK: [[__W_ADDR:%.*]] = alloca <16 x float>, align 64
-// CHECK: store i16 %__M, i16* [[__M_ADDR]], align 2
-// CHECK: store <16 x float> %__W, <16 x float>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
-// CHECK: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[__W_ADDR]], align 64
-// CHECK: store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
-// CHECK: store <16 x float> [[TMP1]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
-// CHECK: [[TMP3:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: store float 0x7FF0000000000000, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[TMP4:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT_I_I:%.*]] = insertelement <16 x float> undef, float [[TMP4]], i32 0
-// CHECK: [[TMP5:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <16 x float> [[VECINIT_I_I]], float [[TMP5]], i32 1
-// CHECK: [[TMP6:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <16 x float> [[VECINIT1_I_I]], float [[TMP6]], i32 2
-// CHECK: [[TMP7:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <16 x float> [[VECINIT2_I_I]], float [[TMP7]], i32 3
-// CHECK: [[TMP8:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT4_I_I:%.*]] = insertelement <16 x float> [[VECINIT3_I_I]], float [[TMP8]], i32 4
-// CHECK: [[TMP9:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT5_I_I:%.*]] = insertelement <16 x float> [[VECINIT4_I_I]], float [[TMP9]], i32 5
-// CHECK: [[TMP10:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT6_I_I:%.*]] = insertelement <16 x float> [[VECINIT5_I_I]], float [[TMP10]], i32 6
-// CHECK: [[TMP11:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT7_I_I:%.*]] = insertelement <16 x float> [[VECINIT6_I_I]], float [[TMP11]], i32 7
-// CHECK: [[TMP12:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT8_I_I:%.*]] = insertelement <16 x float> [[VECINIT7_I_I]], float [[TMP12]], i32 8
-// CHECK: [[TMP13:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT9_I_I:%.*]] = insertelement <16 x float> [[VECINIT8_I_I]], float [[TMP13]], i32 9
-// CHECK: [[TMP14:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT10_I_I:%.*]] = insertelement <16 x float> [[VECINIT9_I_I]], float [[TMP14]], i32 10
-// CHECK: [[TMP15:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT11_I_I:%.*]] = insertelement <16 x float> [[VECINIT10_I_I]], float [[TMP15]], i32 11
-// CHECK: [[TMP16:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT12_I_I:%.*]] = insertelement <16 x float> [[VECINIT11_I_I]], float [[TMP16]], i32 12
-// CHECK: [[TMP17:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT13_I_I:%.*]] = insertelement <16 x float> [[VECINIT12_I_I]], float [[TMP17]], i32 13
-// CHECK: [[TMP18:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT14_I_I:%.*]] = insertelement <16 x float> [[VECINIT13_I_I]], float [[TMP18]], i32 14
-// CHECK: [[TMP19:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT15_I_I:%.*]] = insertelement <16 x float> [[VECINIT14_I_I]], float [[TMP19]], i32 15
-// CHECK: store <16 x float> [[VECINIT15_I_I]], <16 x float>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[SUB_I:%.*]] = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[TMP20]]
-// CHECK: [[TMP21:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-// CHECK: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x float> [[TMP3]], <16 x float> [[SUB_I]]
-// CHECK: store <16 x float> [[TMP22]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP23:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP24:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> [[TMP24]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP25:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP26:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <16 x float> [[TMP25]], <16 x float> [[TMP26]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <16 x float> [[SHUFFLE_I]], <16 x float>* [[__A_ADDR_I19_I]], align 64
-// CHECK: store <16 x float> [[SHUFFLE1_I]], <16 x float>* [[__B_ADDR_I20_I]], align 64
-// CHECK: [[TMP27:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I19_I]], align 64
-// CHECK: [[TMP28:%.*]] = load <16 x float>, <16 x float>* [[__B_ADDR_I20_I]], align 64
-// CHECK: store <16 x float> zeroinitializer, <16 x float>* [[_COMPOUNDLITERAL_I_I18_I]], align 64
-// CHECK: [[TMP29:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I18_I]], align 64
-// CHECK: [[TMP30:%.*]] = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> [[TMP27]], <16 x float> [[TMP28]], <16 x float> [[TMP29]], i16 -1, i32 4) #2
-// CHECK: store <16 x float> [[TMP30]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP31:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP32:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <16 x float> [[TMP31]], <16 x float> [[TMP32]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP33:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP34:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE4_I:%.*]] = shufflevector <16 x float> [[TMP33]], <16 x float> [[TMP34]], <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <16 x float> [[SHUFFLE3_I]], <16 x float>* [[__A_ADDR_I16_I]], align 64
-// CHECK: store <16 x float> [[SHUFFLE4_I]], <16 x float>* [[__B_ADDR_I17_I]], align 64
-// CHECK: [[TMP35:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I16_I]], align 64
-// CHECK: [[TMP36:%.*]] = load <16 x float>, <16 x float>* [[__B_ADDR_I17_I]], align 64
-// CHECK: store <16 x float> zeroinitializer, <16 x float>* [[_COMPOUNDLITERAL_I_I15_I]], align 64
-// CHECK: [[TMP37:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I15_I]], align 64
-// CHECK: [[TMP38:%.*]] = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> [[TMP35]], <16 x float> [[TMP36]], <16 x float> [[TMP37]], i16 -1, i32 4) #2
-// CHECK: store <16 x float> [[TMP38]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP39:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP40:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <16 x float> [[TMP39]], <16 x float> [[TMP40]], <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP41:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP42:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE7_I:%.*]] = shufflevector <16 x float> [[TMP41]], <16 x float> [[TMP42]], <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <16 x float> [[SHUFFLE6_I]], <16 x float>* [[__A_ADDR_I13_I]], align 64
-// CHECK: store <16 x float> [[SHUFFLE7_I]], <16 x float>* [[__B_ADDR_I14_I]], align 64
-// CHECK: [[TMP43:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I13_I]], align 64
-// CHECK: [[TMP44:%.*]] = load <16 x float>, <16 x float>* [[__B_ADDR_I14_I]], align 64
-// CHECK: store <16 x float> zeroinitializer, <16 x float>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP45:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP46:%.*]] = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> [[TMP43]], <16 x float> [[TMP44]], <16 x float> [[TMP45]], i16 -1, i32 4) #2
-// CHECK: store <16 x float> [[TMP46]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP47:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP48:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE9_I:%.*]] = shufflevector <16 x float> [[TMP47]], <16 x float> [[TMP48]], <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP49:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP50:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE10_I:%.*]] = shufflevector <16 x float> [[TMP49]], <16 x float> [[TMP50]], <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <16 x float> [[SHUFFLE9_I]], <16 x float>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <16 x float> [[SHUFFLE10_I]], <16 x float>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP51:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP52:%.*]] = load <16 x float>, <16 x float>* [[__B_ADDR_I_I]], align 64
-// CHECK: store <16 x float> zeroinitializer, <16 x float>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP53:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP54:%.*]] = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> [[TMP51]], <16 x float> [[TMP52]], <16 x float> [[TMP53]], i16 -1, i32 4) #2
-// CHECK: store <16 x float> [[TMP54]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP55:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <16 x float> [[TMP55]], i32 0
-// CHECK: ret float [[VECEXT_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__W2_ADDR_I_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT: [[__U_ADDR_I_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT: [[__A_ADDR_I16_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__B_ADDR_I17_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__A_ADDR_I14_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_ADDR_I15_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__A_ADDR_I12_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_ADDR_I13_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__A2_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__W_ADDR_I_I:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT: [[__M_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T7_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T8_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T9_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T10_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__M_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT: store i16 [[__M:%.*]], i16* [[__M_ADDR]], align 2
+// CHECK-NEXT: store <16 x float> [[__W:%.*]], <16 x float>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
+// CHECK-NEXT: store <16 x float> [[TMP1]], <16 x float>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store float 0xFFF0000000000000, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <16 x float> [[VECINIT_I_I]], float [[TMP3]], i32 1
+// CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <16 x float> [[VECINIT1_I_I]], float [[TMP4]], i32 2
+// CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <16 x float> [[VECINIT2_I_I]], float [[TMP5]], i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT4_I_I:%.*]] = insertelement <16 x float> [[VECINIT3_I_I]], float [[TMP6]], i32 4
+// CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT5_I_I:%.*]] = insertelement <16 x float> [[VECINIT4_I_I]], float [[TMP7]], i32 5
+// CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT6_I_I:%.*]] = insertelement <16 x float> [[VECINIT5_I_I]], float [[TMP8]], i32 6
+// CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT7_I_I:%.*]] = insertelement <16 x float> [[VECINIT6_I_I]], float [[TMP9]], i32 7
+// CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT8_I_I:%.*]] = insertelement <16 x float> [[VECINIT7_I_I]], float [[TMP10]], i32 8
+// CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT9_I_I:%.*]] = insertelement <16 x float> [[VECINIT8_I_I]], float [[TMP11]], i32 9
+// CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT10_I_I:%.*]] = insertelement <16 x float> [[VECINIT9_I_I]], float [[TMP12]], i32 10
+// CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT11_I_I:%.*]] = insertelement <16 x float> [[VECINIT10_I_I]], float [[TMP13]], i32 11
+// CHECK-NEXT: [[TMP14:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT12_I_I:%.*]] = insertelement <16 x float> [[VECINIT11_I_I]], float [[TMP14]], i32 12
+// CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT13_I_I:%.*]] = insertelement <16 x float> [[VECINIT12_I_I]], float [[TMP15]], i32 13
+// CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT14_I_I:%.*]] = insertelement <16 x float> [[VECINIT13_I_I]], float [[TMP16]], i32 14
+// CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT15_I_I:%.*]] = insertelement <16 x float> [[VECINIT14_I_I]], float [[TMP17]], i32 15
+// CHECK-NEXT: store <16 x float> [[VECINIT15_I_I]], <16 x float>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP18:%.*]] = load <16 x float>, <16 x float>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP19:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
+// CHECK-NEXT: [[TMP20:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store <16 x float> [[TMP18]], <16 x float>* [[__W2_ADDR_I_I]], align 64
+// CHECK-NEXT: store i16 [[TMP19]], i16* [[__U_ADDR_I_I]], align 2
+// CHECK-NEXT: store <16 x float> [[TMP20]], <16 x float>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP21:%.*]] = load i16, i16* [[__U_ADDR_I_I]], align 2
+// CHECK-NEXT: [[TMP22:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP23:%.*]] = load <16 x float>, <16 x float>* [[__W2_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP24:%.*]] = bitcast i16 [[TMP21]] to <16 x i1>
+// CHECK-NEXT: [[TMP25:%.*]] = select <16 x i1> [[TMP24]], <16 x float> [[TMP22]], <16 x float> [[TMP23]]
+// CHECK-NEXT: store <16 x float> [[TMP25]], <16 x float>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP26:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x float> [[TMP26]] to <8 x double>
+// CHECK-NEXT: [[EXTRACT_I:%.*]] = shufflevector <8 x double> [[TMP27]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x double> [[EXTRACT_I]] to <8 x float>
+// CHECK-NEXT: store <8 x float> [[TMP28]], <8 x float>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP29:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x float> [[TMP29]] to <8 x double>
+// CHECK-NEXT: [[EXTRACT4_I:%.*]] = shufflevector <8 x double> [[TMP30]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x double> [[EXTRACT4_I]] to <8 x float>
+// CHECK-NEXT: store <8 x float> [[TMP31]], <8 x float>* [[__T2_I]], align 32
+// CHECK-NEXT: [[TMP32:%.*]] = load <8 x float>, <8 x float>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP33:%.*]] = load <8 x float>, <8 x float>* [[__T2_I]], align 32
+// CHECK-NEXT: store <8 x float> [[TMP32]], <8 x float>* [[__A_ADDR_I16_I]], align 32
+// CHECK-NEXT: store <8 x float> [[TMP33]], <8 x float>* [[__B_ADDR_I17_I]], align 32
+// CHECK-NEXT: [[TMP34:%.*]] = load <8 x float>, <8 x float>* [[__A_ADDR_I16_I]], align 32
+// CHECK-NEXT: [[TMP35:%.*]] = load <8 x float>, <8 x float>* [[__B_ADDR_I17_I]], align 32
+// CHECK-NEXT: [[TMP36:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[TMP34]], <8 x float> [[TMP35]]) #2
+// CHECK-NEXT: store <8 x float> [[TMP36]], <8 x float>* [[__T3_I]], align 32
+// CHECK-NEXT: [[TMP37:%.*]] = load <8 x float>, <8 x float>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT6_I:%.*]] = shufflevector <8 x float> [[TMP37]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <4 x float> [[EXTRACT6_I]], <4 x float>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP38:%.*]] = load <8 x float>, <8 x float>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT7_I:%.*]] = shufflevector <8 x float> [[TMP38]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: store <4 x float> [[EXTRACT7_I]], <4 x float>* [[__T5_I]], align 16
+// CHECK-NEXT: [[TMP39:%.*]] = load <4 x float>, <4 x float>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP40:%.*]] = load <4 x float>, <4 x float>* [[__T5_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP39]], <4 x float>* [[__A_ADDR_I14_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP40]], <4 x float>* [[__B_ADDR_I15_I]], align 16
+// CHECK-NEXT: [[TMP41:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I14_I]], align 16
+// CHECK-NEXT: [[TMP42:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I15_I]], align 16
+// CHECK-NEXT: [[TMP43:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[TMP41]], <4 x float> [[TMP42]]) #2
+// CHECK-NEXT: store <4 x float> [[TMP43]], <4 x float>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP44:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP45:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP44]], <4 x float> [[TMP45]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK-NEXT: store <4 x float> [[SHUFFLE_I]], <4 x float>* [[__T7_I]], align 16
+// CHECK-NEXT: [[TMP46:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP47:%.*]] = load <4 x float>, <4 x float>* [[__T7_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP46]], <4 x float>* [[__A_ADDR_I12_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP47]], <4 x float>* [[__B_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP48:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I12_I]], align 16
+// CHECK-NEXT: [[TMP49:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP50:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[TMP48]], <4 x float> [[TMP49]]) #2
+// CHECK-NEXT: store <4 x float> [[TMP50]], <4 x float>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP51:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP52:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
+// CHECK-NEXT: [[SHUFFLE10_I:%.*]] = shufflevector <4 x float> [[TMP51]], <4 x float> [[TMP52]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: store <4 x float> [[SHUFFLE10_I]], <4 x float>* [[__T9_I]], align 16
+// CHECK-NEXT: [[TMP53:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP54:%.*]] = load <4 x float>, <4 x float>* [[__T9_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP53]], <4 x float>* [[__A2_ADDR_I_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP54]], <4 x float>* [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP55:%.*]] = load <4 x float>, <4 x float>* [[__A2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP56:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP57:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[TMP55]], <4 x float> [[TMP56]]) #2
+// CHECK-NEXT: store <4 x float> [[TMP57]], <4 x float>* [[__T10_I]], align 16
+// CHECK-NEXT: [[TMP58:%.*]] = load <4 x float>, <4 x float>* [[__T10_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x float> [[TMP58]], i32 0
+// CHECK-NEXT: ret float [[VECEXT_I]]
float test_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __W){
return _mm512_mask_reduce_max_ps(__M, __W);
}
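For the floating-point variant, the checks show masked-off lanes being seeded with negative infinity (the store of 0xFFF0000000000000 and the 16-lane splat) before the same halving pattern runs through llvm.x86.avx.max.ps.256 and llvm.x86.sse.max.ps. A hedged scalar model of the expected result, assuming finite inputs (the NaN behavior of the max.ps intrinsics is deliberately not modeled, and the helper name is hypothetical):

#include <math.h>

static float reduce_max_ps_model(unsigned short mask, const float v[16]) {
  /* Masked-off lanes act as -INFINITY, the identity for max. */
  float best = -INFINITY;
  for (int i = 0; i < 16; ++i) {
    float lane = ((mask >> i) & 1) ? v[i] : -INFINITY;
    if (lane > best)
      best = lane;
  }
  return best;
}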
// CHECK-LABEL: define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I18_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I19_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I20_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I15_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I16_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I17_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I12_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I13_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I14_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__S_ADDR_I_I:%.*]] = alloca i32, align 4
-// CHECK: [[_COMPOUNDLITERAL_I_I:%.*]] = alloca <16 x i32>, align 64
-// CHECK: [[__M_ADDR_I:%.*]] = alloca i16, align 2
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__M_ADDR:%.*]] = alloca i16, align 2
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK: store i16 %__M, i16* [[__M_ADDR]], align 2
-// CHECK: store <8 x i64> %__W, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
-// CHECK: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
-// CHECK: store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
-// CHECK: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
-// CHECK: store i32 2147483647, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[TMP5:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT_I_I:%.*]] = insertelement <16 x i32> undef, i32 [[TMP5]], i32 0
-// CHECK: [[TMP6:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <16 x i32> [[VECINIT_I_I]], i32 [[TMP6]], i32 1
-// CHECK: [[TMP7:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <16 x i32> [[VECINIT1_I_I]], i32 [[TMP7]], i32 2
-// CHECK: [[TMP8:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <16 x i32> [[VECINIT2_I_I]], i32 [[TMP8]], i32 3
-// CHECK: [[TMP9:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT4_I_I:%.*]] = insertelement <16 x i32> [[VECINIT3_I_I]], i32 [[TMP9]], i32 4
-// CHECK: [[TMP10:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT5_I_I:%.*]] = insertelement <16 x i32> [[VECINIT4_I_I]], i32 [[TMP10]], i32 5
-// CHECK: [[TMP11:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT6_I_I:%.*]] = insertelement <16 x i32> [[VECINIT5_I_I]], i32 [[TMP11]], i32 6
-// CHECK: [[TMP12:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT7_I_I:%.*]] = insertelement <16 x i32> [[VECINIT6_I_I]], i32 [[TMP12]], i32 7
-// CHECK: [[TMP13:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT8_I_I:%.*]] = insertelement <16 x i32> [[VECINIT7_I_I]], i32 [[TMP13]], i32 8
-// CHECK: [[TMP14:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT9_I_I:%.*]] = insertelement <16 x i32> [[VECINIT8_I_I]], i32 [[TMP14]], i32 9
-// CHECK: [[TMP15:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT10_I_I:%.*]] = insertelement <16 x i32> [[VECINIT9_I_I]], i32 [[TMP15]], i32 10
-// CHECK: [[TMP16:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT11_I_I:%.*]] = insertelement <16 x i32> [[VECINIT10_I_I]], i32 [[TMP16]], i32 11
-// CHECK: [[TMP17:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT12_I_I:%.*]] = insertelement <16 x i32> [[VECINIT11_I_I]], i32 [[TMP17]], i32 12
-// CHECK: [[TMP18:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT13_I_I:%.*]] = insertelement <16 x i32> [[VECINIT12_I_I]], i32 [[TMP18]], i32 13
-// CHECK: [[TMP19:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT14_I_I:%.*]] = insertelement <16 x i32> [[VECINIT13_I_I]], i32 [[TMP19]], i32 14
-// CHECK: [[TMP20:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT15_I_I:%.*]] = insertelement <16 x i32> [[VECINIT14_I_I]], i32 [[TMP20]], i32 15
-// CHECK: store <16 x i32> [[VECINIT15_I_I]], <16 x i32>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP21:%.*]] = load <16 x i32>, <16 x i32>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP22:%.*]] = bitcast <16 x i32> [[TMP21]] to <8 x i64>
-// CHECK: [[TMP23:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-// CHECK: [[TMP24:%.*]] = select <16 x i1> [[TMP23]], <16 x i32> [[TMP4]], <16 x i32> [[TMP21]]
-// CHECK: [[TMP25:%.*]] = bitcast <16 x i32> [[TMP24]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP25]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP26:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP27:%.*]] = bitcast <8 x i64> [[TMP26]] to <16 x i32>
-// CHECK: [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP29:%.*]] = bitcast <8 x i64> [[TMP28]] to <16 x i32>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i32> [[TMP27]], <16 x i32> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP30:%.*]] = bitcast <16 x i32> [[SHUFFLE_I]] to <8 x i64>
-// CHECK: [[TMP31:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP32:%.*]] = bitcast <8 x i64> [[TMP31]] to <16 x i32>
-// CHECK: [[TMP33:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP34:%.*]] = bitcast <8 x i64> [[TMP33]] to <16 x i32>
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <16 x i32> [[TMP32]], <16 x i32> [[TMP34]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP35:%.*]] = bitcast <16 x i32> [[SHUFFLE1_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP30]], <8 x i64>* [[__A_ADDR_I19_I]], align 64
-// CHECK: store <8 x i64> [[TMP35]], <8 x i64>* [[__B_ADDR_I20_I]], align 64
-// CHECK: [[TMP36:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I19_I]], align 64
-// CHECK: [[TMP37:%.*]] = bitcast <8 x i64> [[TMP36]] to <16 x i32>
-// CHECK: [[TMP38:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I20_I]], align 64
-// CHECK: [[TMP39:%.*]] = bitcast <8 x i64> [[TMP38]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I18_I]], align 64
-// CHECK: [[TMP40:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I18_I]], align 64
-// CHECK: [[TMP41:%.*]] = bitcast <8 x i64> [[TMP40]] to <16 x i32>
-// CHECK: [[TMP42:%.*]] = icmp slt <16 x i32> [[TMP37]], [[TMP39]]
-// CHECK: [[TMP43:%.*]] = select <16 x i1> [[TMP42]], <16 x i32> [[TMP37]], <16 x i32> [[TMP39]]
-// CHECK: [[TMP44:%.*]] = bitcast <16 x i32> [[TMP43]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP44]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP45:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP46:%.*]] = bitcast <8 x i64> [[TMP45]] to <16 x i32>
-// CHECK: [[TMP47:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP48:%.*]] = bitcast <8 x i64> [[TMP47]] to <16 x i32>
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <16 x i32> [[TMP46]], <16 x i32> [[TMP48]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP49:%.*]] = bitcast <16 x i32> [[SHUFFLE3_I]] to <8 x i64>
-// CHECK: [[TMP50:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP51:%.*]] = bitcast <8 x i64> [[TMP50]] to <16 x i32>
-// CHECK: [[TMP52:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP53:%.*]] = bitcast <8 x i64> [[TMP52]] to <16 x i32>
-// CHECK: [[SHUFFLE4_I:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP53]], <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP54:%.*]] = bitcast <16 x i32> [[SHUFFLE4_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP49]], <8 x i64>* [[__A_ADDR_I16_I]], align 64
-// CHECK: store <8 x i64> [[TMP54]], <8 x i64>* [[__B_ADDR_I17_I]], align 64
-// CHECK: [[TMP55:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I16_I]], align 64
-// CHECK: [[TMP56:%.*]] = bitcast <8 x i64> [[TMP55]] to <16 x i32>
-// CHECK: [[TMP57:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I17_I]], align 64
-// CHECK: [[TMP58:%.*]] = bitcast <8 x i64> [[TMP57]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I15_I]], align 64
-// CHECK: [[TMP59:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I15_I]], align 64
-// CHECK: [[TMP60:%.*]] = bitcast <8 x i64> [[TMP59]] to <16 x i32>
-// CHECK: [[TMP61:%.*]] = icmp slt <16 x i32> [[TMP56]], [[TMP58]]
-// CHECK: [[TMP62:%.*]] = select <16 x i1> [[TMP61]], <16 x i32> [[TMP56]], <16 x i32> [[TMP58]]
-// CHECK: [[TMP63:%.*]] = bitcast <16 x i32> [[TMP62]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP63]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP64:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP65:%.*]] = bitcast <8 x i64> [[TMP64]] to <16 x i32>
-// CHECK: [[TMP66:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP67:%.*]] = bitcast <8 x i64> [[TMP66]] to <16 x i32>
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP68:%.*]] = bitcast <16 x i32> [[SHUFFLE6_I]] to <8 x i64>
-// CHECK: [[TMP69:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP70:%.*]] = bitcast <8 x i64> [[TMP69]] to <16 x i32>
-// CHECK: [[TMP71:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP72:%.*]] = bitcast <8 x i64> [[TMP71]] to <16 x i32>
-// CHECK: [[SHUFFLE7_I:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP72]], <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP73:%.*]] = bitcast <16 x i32> [[SHUFFLE7_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP68]], <8 x i64>* [[__A_ADDR_I13_I]], align 64
-// CHECK: store <8 x i64> [[TMP73]], <8 x i64>* [[__B_ADDR_I14_I]], align 64
-// CHECK: [[TMP74:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I13_I]], align 64
-// CHECK: [[TMP75:%.*]] = bitcast <8 x i64> [[TMP74]] to <16 x i32>
-// CHECK: [[TMP76:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I14_I]], align 64
-// CHECK: [[TMP77:%.*]] = bitcast <8 x i64> [[TMP76]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP78:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP79:%.*]] = bitcast <8 x i64> [[TMP78]] to <16 x i32>
-// CHECK: [[TMP80:%.*]] = icmp slt <16 x i32> [[TMP75]], [[TMP77]]
-// CHECK: [[TMP81:%.*]] = select <16 x i1> [[TMP80]], <16 x i32> [[TMP75]], <16 x i32> [[TMP77]]
-// CHECK: [[TMP82:%.*]] = bitcast <16 x i32> [[TMP81]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP82]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP83:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP84:%.*]] = bitcast <8 x i64> [[TMP83]] to <16 x i32>
-// CHECK: [[TMP85:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP86:%.*]] = bitcast <8 x i64> [[TMP85]] to <16 x i32>
-// CHECK: [[SHUFFLE9_I:%.*]] = shufflevector <16 x i32> [[TMP84]], <16 x i32> [[TMP86]], <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP87:%.*]] = bitcast <16 x i32> [[SHUFFLE9_I]] to <8 x i64>
-// CHECK: [[TMP88:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP89:%.*]] = bitcast <8 x i64> [[TMP88]] to <16 x i32>
-// CHECK: [[TMP90:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP91:%.*]] = bitcast <8 x i64> [[TMP90]] to <16 x i32>
-// CHECK: [[SHUFFLE10_I:%.*]] = shufflevector <16 x i32> [[TMP89]], <16 x i32> [[TMP91]], <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP92:%.*]] = bitcast <16 x i32> [[SHUFFLE10_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP87]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> [[TMP92]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP93:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP94:%.*]] = bitcast <8 x i64> [[TMP93]] to <16 x i32>
-// CHECK: [[TMP95:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP96:%.*]] = bitcast <8 x i64> [[TMP95]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP97:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP98:%.*]] = bitcast <8 x i64> [[TMP97]] to <16 x i32>
-// CHECK: [[TMP99:%.*]] = icmp slt <16 x i32> [[TMP94]], [[TMP96]]
-// CHECK: [[TMP100:%.*]] = select <16 x i1> [[TMP99]], <16 x i32> [[TMP94]], <16 x i32> [[TMP96]]
-// CHECK: [[TMP101:%.*]] = bitcast <16 x i32> [[TMP100]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP101]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP102:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP102]], i32 0
-// CHECK: [[CONV_I:%.*]] = trunc i64 [[VECEXT_I]] to i32
-// CHECK: ret i32 [[CONV_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__W_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__U_ADDR_I_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A2_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__V1_ADDR_I14_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I15_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V1_ADDR_I12_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I13_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V1_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__S_ADDR_I_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x i32>, align 64
+// CHECK-NEXT: [[__M_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T7_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T8_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T9_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T10_I:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT: [[__M_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: store i16 [[__M:%.*]], i16* [[__M_ADDR]], align 2
+// CHECK-NEXT: store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
+// CHECK-NEXT: store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store i32 2147483647, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <16 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <16 x i32> [[VECINIT_I_I]], i32 [[TMP3]], i32 1
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <16 x i32> [[VECINIT1_I_I]], i32 [[TMP4]], i32 2
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <16 x i32> [[VECINIT2_I_I]], i32 [[TMP5]], i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT4_I_I:%.*]] = insertelement <16 x i32> [[VECINIT3_I_I]], i32 [[TMP6]], i32 4
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT5_I_I:%.*]] = insertelement <16 x i32> [[VECINIT4_I_I]], i32 [[TMP7]], i32 5
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT6_I_I:%.*]] = insertelement <16 x i32> [[VECINIT5_I_I]], i32 [[TMP8]], i32 6
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT7_I_I:%.*]] = insertelement <16 x i32> [[VECINIT6_I_I]], i32 [[TMP9]], i32 7
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT8_I_I:%.*]] = insertelement <16 x i32> [[VECINIT7_I_I]], i32 [[TMP10]], i32 8
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT9_I_I:%.*]] = insertelement <16 x i32> [[VECINIT8_I_I]], i32 [[TMP11]], i32 9
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT10_I_I:%.*]] = insertelement <16 x i32> [[VECINIT9_I_I]], i32 [[TMP12]], i32 10
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT11_I_I:%.*]] = insertelement <16 x i32> [[VECINIT10_I_I]], i32 [[TMP13]], i32 11
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT12_I_I:%.*]] = insertelement <16 x i32> [[VECINIT11_I_I]], i32 [[TMP14]], i32 12
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT13_I_I:%.*]] = insertelement <16 x i32> [[VECINIT12_I_I]], i32 [[TMP15]], i32 13
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT14_I_I:%.*]] = insertelement <16 x i32> [[VECINIT13_I_I]], i32 [[TMP16]], i32 14
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT15_I_I:%.*]] = insertelement <16 x i32> [[VECINIT14_I_I]], i32 [[TMP17]], i32 15
+// CHECK-NEXT: store <16 x i32> [[VECINIT15_I_I]], <16 x i32>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP18:%.*]] = load <16 x i32>, <16 x i32>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i32> [[TMP18]] to <8 x i64>
+// CHECK-NEXT: [[TMP20:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
+// CHECK-NEXT: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP19]], <8 x i64>* [[__W_ADDR_I_I]], align 64
+// CHECK-NEXT: store i16 [[TMP20]], i16* [[__U_ADDR_I_I]], align 2
+// CHECK-NEXT: store <8 x i64> [[TMP21]], <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP22:%.*]] = load i16, i16* [[__U_ADDR_I_I]], align 2
+// CHECK-NEXT: [[TMP23:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i64> [[TMP23]] to <16 x i32>
+// CHECK-NEXT: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP26:%.*]] = bitcast <8 x i64> [[TMP25]] to <16 x i32>
+// CHECK-NEXT: [[TMP27:%.*]] = bitcast i16 [[TMP22]] to <16 x i1>
+// CHECK-NEXT: [[TMP28:%.*]] = select <16 x i1> [[TMP27]], <16 x i32> [[TMP24]], <16 x i32> [[TMP26]]
+// CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[TMP28]] to <8 x i64>
+// CHECK-NEXT: store <8 x i64> [[TMP29]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT_I:%.*]] = shufflevector <8 x i64> [[TMP30]], <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <4 x i64> [[EXTRACT_I]], <4 x i64>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP31:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT4_I:%.*]] = shufflevector <8 x i64> [[TMP31]], <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: store <4 x i64> [[EXTRACT4_I]], <4 x i64>* [[__T2_I]], align 32
+// CHECK-NEXT: [[TMP32:%.*]] = load <4 x i64>, <4 x i64>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP33:%.*]] = load <4 x i64>, <4 x i64>* [[__T2_I]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP32]], <4 x i64>* [[__A2_ADDR_I_I]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP33]], <4 x i64>* [[__B_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP34:%.*]] = load <4 x i64>, <4 x i64>* [[__A2_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP35:%.*]] = bitcast <4 x i64> [[TMP34]] to <8 x i32>
+// CHECK-NEXT: [[TMP36:%.*]] = load <4 x i64>, <4 x i64>* [[__B_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i64> [[TMP36]] to <8 x i32>
+// CHECK-NEXT: [[TMP38:%.*]] = icmp slt <8 x i32> [[TMP35]], [[TMP37]]
+// CHECK-NEXT: [[TMP39:%.*]] = select <8 x i1> [[TMP38]], <8 x i32> [[TMP35]], <8 x i32> [[TMP37]]
+// CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP39]] to <4 x i64>
+// CHECK-NEXT: store <4 x i64> [[TMP40]], <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[TMP41:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT6_I:%.*]] = shufflevector <4 x i64> [[TMP41]], <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: store <2 x i64> [[EXTRACT6_I]], <2 x i64>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP42:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT7_I:%.*]] = shufflevector <4 x i64> [[TMP42]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: store <2 x i64> [[EXTRACT7_I]], <2 x i64>* [[__T5_I]], align 16
+// CHECK-NEXT: [[TMP43:%.*]] = load <2 x i64>, <2 x i64>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP44:%.*]] = load <2 x i64>, <2 x i64>* [[__T5_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP43]], <2 x i64>* [[__V1_ADDR_I14_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* [[__V2_ADDR_I15_I]], align 16
+// CHECK-NEXT: [[TMP45:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I14_I]], align 16
+// CHECK-NEXT: [[TMP46:%.*]] = bitcast <2 x i64> [[TMP45]] to <4 x i32>
+// CHECK-NEXT: [[TMP47:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I15_I]], align 16
+// CHECK-NEXT: [[TMP48:%.*]] = bitcast <2 x i64> [[TMP47]] to <4 x i32>
+// CHECK-NEXT: [[TMP49:%.*]] = icmp slt <4 x i32> [[TMP46]], [[TMP48]]
+// CHECK-NEXT: [[TMP50:%.*]] = select <4 x i1> [[TMP49]], <4 x i32> [[TMP46]], <4 x i32> [[TMP48]]
+// CHECK-NEXT: [[TMP51:%.*]] = bitcast <4 x i32> [[TMP50]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP51]], <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP52:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP53:%.*]] = bitcast <2 x i64> [[TMP52]] to <4 x i32>
+// CHECK-NEXT: [[TMP54:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP55:%.*]] = bitcast <2 x i64> [[TMP54]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> [[TMP55]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK-NEXT: [[TMP56:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP56]], <2 x i64>* [[__T7_I]], align 16
+// CHECK-NEXT: [[TMP57:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP58:%.*]] = load <2 x i64>, <2 x i64>* [[__T7_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP57]], <2 x i64>* [[__V1_ADDR_I12_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP58]], <2 x i64>* [[__V2_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP59:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I12_I]], align 16
+// CHECK-NEXT: [[TMP60:%.*]] = bitcast <2 x i64> [[TMP59]] to <4 x i32>
+// CHECK-NEXT: [[TMP61:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP62:%.*]] = bitcast <2 x i64> [[TMP61]] to <4 x i32>
+// CHECK-NEXT: [[TMP63:%.*]] = icmp slt <4 x i32> [[TMP60]], [[TMP62]]
+// CHECK-NEXT: [[TMP64:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP60]], <4 x i32> [[TMP62]]
+// CHECK-NEXT: [[TMP65:%.*]] = bitcast <4 x i32> [[TMP64]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP65]], <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP66:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP67:%.*]] = bitcast <2 x i64> [[TMP66]] to <4 x i32>
+// CHECK-NEXT: [[TMP68:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP69:%.*]] = bitcast <2 x i64> [[TMP68]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE10_I:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> [[TMP69]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: [[TMP70:%.*]] = bitcast <4 x i32> [[SHUFFLE10_I]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP70]], <2 x i64>* [[__T9_I]], align 16
+// CHECK-NEXT: [[TMP71:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP72:%.*]] = load <2 x i64>, <2 x i64>* [[__T9_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP71]], <2 x i64>* [[__V1_ADDR_I_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP72]], <2 x i64>* [[__V2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP73:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP74:%.*]] = bitcast <2 x i64> [[TMP73]] to <4 x i32>
+// CHECK-NEXT: [[TMP75:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP76:%.*]] = bitcast <2 x i64> [[TMP75]] to <4 x i32>
+// CHECK-NEXT: [[TMP77:%.*]] = icmp slt <4 x i32> [[TMP74]], [[TMP76]]
+// CHECK-NEXT: [[TMP78:%.*]] = select <4 x i1> [[TMP77]], <4 x i32> [[TMP74]], <4 x i32> [[TMP76]]
+// CHECK-NEXT: [[TMP79:%.*]] = bitcast <4 x i32> [[TMP78]] to <2 x i64>
+// CHECK-NEXT: store <4 x i32> [[TMP78]], <4 x i32>* [[__T10_I]], align 16
+// CHECK-NEXT: [[TMP80:%.*]] = load <4 x i32>, <4 x i32>* [[__T10_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP80]], i32 0
+// CHECK-NEXT: ret i32 [[VECEXT_I]]
int test_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __W){
return _mm512_mask_reduce_min_epi32(__M, __W);
}
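The signed-min reduction mirrors the unsigned-max case, except that masked-off lanes are seeded with INT_MAX (the store i32 2147483647 above) and each combining step is icmp slt + select. An illustrative scalar equivalent under the same assumption (helper name is hypothetical, not taken from the test):

#include <limits.h>

static int reduce_min_epi32_model(unsigned short mask, const int v[16]) {
  /* Masked-off lanes act as INT_MAX, the identity for signed min. */
  int best = INT_MAX;
  for (int i = 0; i < 16; ++i) {
    int lane = ((mask >> i) & 1) ? v[i] : INT_MAX;
    if (lane < best)
      best = lane;
  }
  return best;
}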
// CHECK-LABEL: define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I18_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I19_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I20_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I15_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I16_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I17_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I12_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I13_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I14_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__S_ADDR_I_I:%.*]] = alloca i32, align 4
-// CHECK: [[_COMPOUNDLITERAL_I_I:%.*]] = alloca <16 x i32>, align 64
-// CHECK: [[__M_ADDR_I:%.*]] = alloca i16, align 2
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
-// CHECK: [[__M_ADDR:%.*]] = alloca i16, align 2
-// CHECK: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
-// CHECK: store i16 %__M, i16* [[__M_ADDR]], align 2
-// CHECK: store <8 x i64> %__W, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
-// CHECK: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
-// CHECK: store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
-// CHECK: store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
-// CHECK: [[TMP3:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
-// CHECK: store i32 -1, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[TMP5:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT_I_I:%.*]] = insertelement <16 x i32> undef, i32 [[TMP5]], i32 0
-// CHECK: [[TMP6:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <16 x i32> [[VECINIT_I_I]], i32 [[TMP6]], i32 1
-// CHECK: [[TMP7:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <16 x i32> [[VECINIT1_I_I]], i32 [[TMP7]], i32 2
-// CHECK: [[TMP8:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <16 x i32> [[VECINIT2_I_I]], i32 [[TMP8]], i32 3
-// CHECK: [[TMP9:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT4_I_I:%.*]] = insertelement <16 x i32> [[VECINIT3_I_I]], i32 [[TMP9]], i32 4
-// CHECK: [[TMP10:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT5_I_I:%.*]] = insertelement <16 x i32> [[VECINIT4_I_I]], i32 [[TMP10]], i32 5
-// CHECK: [[TMP11:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT6_I_I:%.*]] = insertelement <16 x i32> [[VECINIT5_I_I]], i32 [[TMP11]], i32 6
-// CHECK: [[TMP12:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT7_I_I:%.*]] = insertelement <16 x i32> [[VECINIT6_I_I]], i32 [[TMP12]], i32 7
-// CHECK: [[TMP13:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT8_I_I:%.*]] = insertelement <16 x i32> [[VECINIT7_I_I]], i32 [[TMP13]], i32 8
-// CHECK: [[TMP14:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT9_I_I:%.*]] = insertelement <16 x i32> [[VECINIT8_I_I]], i32 [[TMP14]], i32 9
-// CHECK: [[TMP15:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT10_I_I:%.*]] = insertelement <16 x i32> [[VECINIT9_I_I]], i32 [[TMP15]], i32 10
-// CHECK: [[TMP16:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT11_I_I:%.*]] = insertelement <16 x i32> [[VECINIT10_I_I]], i32 [[TMP16]], i32 11
-// CHECK: [[TMP17:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT12_I_I:%.*]] = insertelement <16 x i32> [[VECINIT11_I_I]], i32 [[TMP17]], i32 12
-// CHECK: [[TMP18:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT13_I_I:%.*]] = insertelement <16 x i32> [[VECINIT12_I_I]], i32 [[TMP18]], i32 13
-// CHECK: [[TMP19:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT14_I_I:%.*]] = insertelement <16 x i32> [[VECINIT13_I_I]], i32 [[TMP19]], i32 14
-// CHECK: [[TMP20:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
-// CHECK: [[VECINIT15_I_I:%.*]] = insertelement <16 x i32> [[VECINIT14_I_I]], i32 [[TMP20]], i32 15
-// CHECK: store <16 x i32> [[VECINIT15_I_I]], <16 x i32>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP21:%.*]] = load <16 x i32>, <16 x i32>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP22:%.*]] = bitcast <16 x i32> [[TMP21]] to <8 x i64>
-// CHECK: [[TMP23:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-// CHECK: [[TMP24:%.*]] = select <16 x i1> [[TMP23]], <16 x i32> [[TMP4]], <16 x i32> [[TMP21]]
-// CHECK: [[TMP25:%.*]] = bitcast <16 x i32> [[TMP24]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP25]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP26:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP27:%.*]] = bitcast <8 x i64> [[TMP26]] to <16 x i32>
-// CHECK: [[TMP28:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP29:%.*]] = bitcast <8 x i64> [[TMP28]] to <16 x i32>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i32> [[TMP27]], <16 x i32> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP30:%.*]] = bitcast <16 x i32> [[SHUFFLE_I]] to <8 x i64>
-// CHECK: [[TMP31:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP32:%.*]] = bitcast <8 x i64> [[TMP31]] to <16 x i32>
-// CHECK: [[TMP33:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP34:%.*]] = bitcast <8 x i64> [[TMP33]] to <16 x i32>
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <16 x i32> [[TMP32]], <16 x i32> [[TMP34]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP35:%.*]] = bitcast <16 x i32> [[SHUFFLE1_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP30]], <8 x i64>* [[__A_ADDR_I19_I]], align 64
-// CHECK: store <8 x i64> [[TMP35]], <8 x i64>* [[__B_ADDR_I20_I]], align 64
-// CHECK: [[TMP36:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I19_I]], align 64
-// CHECK: [[TMP37:%.*]] = bitcast <8 x i64> [[TMP36]] to <16 x i32>
-// CHECK: [[TMP38:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I20_I]], align 64
-// CHECK: [[TMP39:%.*]] = bitcast <8 x i64> [[TMP38]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I18_I]], align 64
-// CHECK: [[TMP40:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I18_I]], align 64
-// CHECK: [[TMP41:%.*]] = bitcast <8 x i64> [[TMP40]] to <16 x i32>
-// CHECK: [[TMP42:%.*]] = icmp ult <16 x i32> [[TMP37]], [[TMP39]]
-// CHECK: [[TMP43:%.*]] = select <16 x i1> [[TMP42]], <16 x i32> [[TMP37]], <16 x i32> [[TMP39]]
-// CHECK: [[TMP44:%.*]] = bitcast <16 x i32> [[TMP43]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP44]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP45:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP46:%.*]] = bitcast <8 x i64> [[TMP45]] to <16 x i32>
-// CHECK: [[TMP47:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP48:%.*]] = bitcast <8 x i64> [[TMP47]] to <16 x i32>
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <16 x i32> [[TMP46]], <16 x i32> [[TMP48]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP49:%.*]] = bitcast <16 x i32> [[SHUFFLE3_I]] to <8 x i64>
-// CHECK: [[TMP50:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP51:%.*]] = bitcast <8 x i64> [[TMP50]] to <16 x i32>
-// CHECK: [[TMP52:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP53:%.*]] = bitcast <8 x i64> [[TMP52]] to <16 x i32>
-// CHECK: [[SHUFFLE4_I:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP53]], <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP54:%.*]] = bitcast <16 x i32> [[SHUFFLE4_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP49]], <8 x i64>* [[__A_ADDR_I16_I]], align 64
-// CHECK: store <8 x i64> [[TMP54]], <8 x i64>* [[__B_ADDR_I17_I]], align 64
-// CHECK: [[TMP55:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I16_I]], align 64
-// CHECK: [[TMP56:%.*]] = bitcast <8 x i64> [[TMP55]] to <16 x i32>
-// CHECK: [[TMP57:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I17_I]], align 64
-// CHECK: [[TMP58:%.*]] = bitcast <8 x i64> [[TMP57]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I15_I]], align 64
-// CHECK: [[TMP59:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I15_I]], align 64
-// CHECK: [[TMP60:%.*]] = bitcast <8 x i64> [[TMP59]] to <16 x i32>
-// CHECK: [[TMP61:%.*]] = icmp ult <16 x i32> [[TMP56]], [[TMP58]]
-// CHECK: [[TMP62:%.*]] = select <16 x i1> [[TMP61]], <16 x i32> [[TMP56]], <16 x i32> [[TMP58]]
-// CHECK: [[TMP63:%.*]] = bitcast <16 x i32> [[TMP62]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP63]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP64:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP65:%.*]] = bitcast <8 x i64> [[TMP64]] to <16 x i32>
-// CHECK: [[TMP66:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP67:%.*]] = bitcast <8 x i64> [[TMP66]] to <16 x i32>
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP68:%.*]] = bitcast <16 x i32> [[SHUFFLE6_I]] to <8 x i64>
-// CHECK: [[TMP69:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP70:%.*]] = bitcast <8 x i64> [[TMP69]] to <16 x i32>
-// CHECK: [[TMP71:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP72:%.*]] = bitcast <8 x i64> [[TMP71]] to <16 x i32>
-// CHECK: [[SHUFFLE7_I:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP72]], <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP73:%.*]] = bitcast <16 x i32> [[SHUFFLE7_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP68]], <8 x i64>* [[__A_ADDR_I13_I]], align 64
-// CHECK: store <8 x i64> [[TMP73]], <8 x i64>* [[__B_ADDR_I14_I]], align 64
-// CHECK: [[TMP74:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I13_I]], align 64
-// CHECK: [[TMP75:%.*]] = bitcast <8 x i64> [[TMP74]] to <16 x i32>
-// CHECK: [[TMP76:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I14_I]], align 64
-// CHECK: [[TMP77:%.*]] = bitcast <8 x i64> [[TMP76]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP78:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP79:%.*]] = bitcast <8 x i64> [[TMP78]] to <16 x i32>
-// CHECK: [[TMP80:%.*]] = icmp ult <16 x i32> [[TMP75]], [[TMP77]]
-// CHECK: [[TMP81:%.*]] = select <16 x i1> [[TMP80]], <16 x i32> [[TMP75]], <16 x i32> [[TMP77]]
-// CHECK: [[TMP82:%.*]] = bitcast <16 x i32> [[TMP81]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP82]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP83:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP84:%.*]] = bitcast <8 x i64> [[TMP83]] to <16 x i32>
-// CHECK: [[TMP85:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP86:%.*]] = bitcast <8 x i64> [[TMP85]] to <16 x i32>
-// CHECK: [[SHUFFLE9_I:%.*]] = shufflevector <16 x i32> [[TMP84]], <16 x i32> [[TMP86]], <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP87:%.*]] = bitcast <16 x i32> [[SHUFFLE9_I]] to <8 x i64>
-// CHECK: [[TMP88:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP89:%.*]] = bitcast <8 x i64> [[TMP88]] to <16 x i32>
-// CHECK: [[TMP90:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP91:%.*]] = bitcast <8 x i64> [[TMP90]] to <16 x i32>
-// CHECK: [[SHUFFLE10_I:%.*]] = shufflevector <16 x i32> [[TMP89]], <16 x i32> [[TMP91]], <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP92:%.*]] = bitcast <16 x i32> [[SHUFFLE10_I]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP87]], <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <8 x i64> [[TMP92]], <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP93:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP94:%.*]] = bitcast <8 x i64> [[TMP93]] to <16 x i32>
-// CHECK: [[TMP95:%.*]] = load <8 x i64>, <8 x i64>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP96:%.*]] = bitcast <8 x i64> [[TMP95]] to <16 x i32>
-// CHECK: store <8 x i64> zeroinitializer, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP97:%.*]] = load <8 x i64>, <8 x i64>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP98:%.*]] = bitcast <8 x i64> [[TMP97]] to <16 x i32>
-// CHECK: [[TMP99:%.*]] = icmp ult <16 x i32> [[TMP94]], [[TMP96]]
-// CHECK: [[TMP100:%.*]] = select <16 x i1> [[TMP99]], <16 x i32> [[TMP94]], <16 x i32> [[TMP96]]
-// CHECK: [[TMP101:%.*]] = bitcast <16 x i32> [[TMP100]] to <8 x i64>
-// CHECK: store <8 x i64> [[TMP101]], <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP102:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <8 x i64> [[TMP102]], i32 0
-// CHECK: [[CONV_I:%.*]] = trunc i64 [[VECEXT_I]] to i32
-// CHECK: ret i32 [[CONV_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__W_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__U_ADDR_I_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__A2_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__V1_ADDR_I14_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I15_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V1_ADDR_I12_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I13_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V1_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__V2_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__S_ADDR_I_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x i32>, align 64
+// CHECK-NEXT: [[__M_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T7_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T8_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T9_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: [[__T10_I:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT: [[__M_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT: store i16 [[__M:%.*]], i16* [[__M_ADDR]], align 2
+// CHECK-NEXT: store <8 x i64> [[__W:%.*]], <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
+// CHECK-NEXT: store <8 x i64> [[TMP1]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store i32 -1, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <16 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <16 x i32> [[VECINIT_I_I]], i32 [[TMP3]], i32 1
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <16 x i32> [[VECINIT1_I_I]], i32 [[TMP4]], i32 2
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <16 x i32> [[VECINIT2_I_I]], i32 [[TMP5]], i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT4_I_I:%.*]] = insertelement <16 x i32> [[VECINIT3_I_I]], i32 [[TMP6]], i32 4
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT5_I_I:%.*]] = insertelement <16 x i32> [[VECINIT4_I_I]], i32 [[TMP7]], i32 5
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT6_I_I:%.*]] = insertelement <16 x i32> [[VECINIT5_I_I]], i32 [[TMP8]], i32 6
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT7_I_I:%.*]] = insertelement <16 x i32> [[VECINIT6_I_I]], i32 [[TMP9]], i32 7
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT8_I_I:%.*]] = insertelement <16 x i32> [[VECINIT7_I_I]], i32 [[TMP10]], i32 8
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT9_I_I:%.*]] = insertelement <16 x i32> [[VECINIT8_I_I]], i32 [[TMP11]], i32 9
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT10_I_I:%.*]] = insertelement <16 x i32> [[VECINIT9_I_I]], i32 [[TMP12]], i32 10
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT11_I_I:%.*]] = insertelement <16 x i32> [[VECINIT10_I_I]], i32 [[TMP13]], i32 11
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT12_I_I:%.*]] = insertelement <16 x i32> [[VECINIT11_I_I]], i32 [[TMP14]], i32 12
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT13_I_I:%.*]] = insertelement <16 x i32> [[VECINIT12_I_I]], i32 [[TMP15]], i32 13
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT14_I_I:%.*]] = insertelement <16 x i32> [[VECINIT13_I_I]], i32 [[TMP16]], i32 14
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[__S_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT15_I_I:%.*]] = insertelement <16 x i32> [[VECINIT14_I_I]], i32 [[TMP17]], i32 15
+// CHECK-NEXT: store <16 x i32> [[VECINIT15_I_I]], <16 x i32>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP18:%.*]] = load <16 x i32>, <16 x i32>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i32> [[TMP18]] to <8 x i64>
+// CHECK-NEXT: [[TMP20:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
+// CHECK-NEXT: [[TMP21:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store <8 x i64> [[TMP19]], <8 x i64>* [[__W_ADDR_I_I]], align 64
+// CHECK-NEXT: store i16 [[TMP20]], i16* [[__U_ADDR_I_I]], align 2
+// CHECK-NEXT: store <8 x i64> [[TMP21]], <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP22:%.*]] = load i16, i16* [[__U_ADDR_I_I]], align 2
+// CHECK-NEXT: [[TMP23:%.*]] = load <8 x i64>, <8 x i64>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i64> [[TMP23]] to <16 x i32>
+// CHECK-NEXT: [[TMP25:%.*]] = load <8 x i64>, <8 x i64>* [[__W_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP26:%.*]] = bitcast <8 x i64> [[TMP25]] to <16 x i32>
+// CHECK-NEXT: [[TMP27:%.*]] = bitcast i16 [[TMP22]] to <16 x i1>
+// CHECK-NEXT: [[TMP28:%.*]] = select <16 x i1> [[TMP27]], <16 x i32> [[TMP24]], <16 x i32> [[TMP26]]
+// CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[TMP28]] to <8 x i64>
+// CHECK-NEXT: store <8 x i64> [[TMP29]], <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP30:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT_I:%.*]] = shufflevector <8 x i64> [[TMP30]], <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <4 x i64> [[EXTRACT_I]], <4 x i64>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP31:%.*]] = load <8 x i64>, <8 x i64>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[EXTRACT4_I:%.*]] = shufflevector <8 x i64> [[TMP31]], <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: store <4 x i64> [[EXTRACT4_I]], <4 x i64>* [[__T2_I]], align 32
+// CHECK-NEXT: [[TMP32:%.*]] = load <4 x i64>, <4 x i64>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP33:%.*]] = load <4 x i64>, <4 x i64>* [[__T2_I]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP32]], <4 x i64>* [[__A2_ADDR_I_I]], align 32
+// CHECK-NEXT: store <4 x i64> [[TMP33]], <4 x i64>* [[__B_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP34:%.*]] = load <4 x i64>, <4 x i64>* [[__A2_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP35:%.*]] = bitcast <4 x i64> [[TMP34]] to <8 x i32>
+// CHECK-NEXT: [[TMP36:%.*]] = load <4 x i64>, <4 x i64>* [[__B_ADDR_I_I]], align 32
+// CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i64> [[TMP36]] to <8 x i32>
+// CHECK-NEXT: [[TMP38:%.*]] = icmp ult <8 x i32> [[TMP35]], [[TMP37]]
+// CHECK-NEXT: [[TMP39:%.*]] = select <8 x i1> [[TMP38]], <8 x i32> [[TMP35]], <8 x i32> [[TMP37]]
+// CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP39]] to <4 x i64>
+// CHECK-NEXT: store <4 x i64> [[TMP40]], <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[TMP41:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT6_I:%.*]] = shufflevector <4 x i64> [[TMP41]], <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: store <2 x i64> [[EXTRACT6_I]], <2 x i64>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP42:%.*]] = load <4 x i64>, <4 x i64>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT7_I:%.*]] = shufflevector <4 x i64> [[TMP42]], <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: store <2 x i64> [[EXTRACT7_I]], <2 x i64>* [[__T5_I]], align 16
+// CHECK-NEXT: [[TMP43:%.*]] = load <2 x i64>, <2 x i64>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP44:%.*]] = load <2 x i64>, <2 x i64>* [[__T5_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP43]], <2 x i64>* [[__V1_ADDR_I14_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP44]], <2 x i64>* [[__V2_ADDR_I15_I]], align 16
+// CHECK-NEXT: [[TMP45:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I14_I]], align 16
+// CHECK-NEXT: [[TMP46:%.*]] = bitcast <2 x i64> [[TMP45]] to <4 x i32>
+// CHECK-NEXT: [[TMP47:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I15_I]], align 16
+// CHECK-NEXT: [[TMP48:%.*]] = bitcast <2 x i64> [[TMP47]] to <4 x i32>
+// CHECK-NEXT: [[TMP49:%.*]] = icmp ult <4 x i32> [[TMP46]], [[TMP48]]
+// CHECK-NEXT: [[TMP50:%.*]] = select <4 x i1> [[TMP49]], <4 x i32> [[TMP46]], <4 x i32> [[TMP48]]
+// CHECK-NEXT: [[TMP51:%.*]] = bitcast <4 x i32> [[TMP50]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP51]], <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP52:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP53:%.*]] = bitcast <2 x i64> [[TMP52]] to <4 x i32>
+// CHECK-NEXT: [[TMP54:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP55:%.*]] = bitcast <2 x i64> [[TMP54]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> [[TMP55]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK-NEXT: [[TMP56:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP56]], <2 x i64>* [[__T7_I]], align 16
+// CHECK-NEXT: [[TMP57:%.*]] = load <2 x i64>, <2 x i64>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP58:%.*]] = load <2 x i64>, <2 x i64>* [[__T7_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP57]], <2 x i64>* [[__V1_ADDR_I12_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP58]], <2 x i64>* [[__V2_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP59:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I12_I]], align 16
+// CHECK-NEXT: [[TMP60:%.*]] = bitcast <2 x i64> [[TMP59]] to <4 x i32>
+// CHECK-NEXT: [[TMP61:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP62:%.*]] = bitcast <2 x i64> [[TMP61]] to <4 x i32>
+// CHECK-NEXT: [[TMP63:%.*]] = icmp ult <4 x i32> [[TMP60]], [[TMP62]]
+// CHECK-NEXT: [[TMP64:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP60]], <4 x i32> [[TMP62]]
+// CHECK-NEXT: [[TMP65:%.*]] = bitcast <4 x i32> [[TMP64]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP65]], <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP66:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP67:%.*]] = bitcast <2 x i64> [[TMP66]] to <4 x i32>
+// CHECK-NEXT: [[TMP68:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP69:%.*]] = bitcast <2 x i64> [[TMP68]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE10_I:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> [[TMP69]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: [[TMP70:%.*]] = bitcast <4 x i32> [[SHUFFLE10_I]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP70]], <2 x i64>* [[__T9_I]], align 16
+// CHECK-NEXT: [[TMP71:%.*]] = load <2 x i64>, <2 x i64>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP72:%.*]] = load <2 x i64>, <2 x i64>* [[__T9_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP71]], <2 x i64>* [[__V1_ADDR_I_I]], align 16
+// CHECK-NEXT: store <2 x i64> [[TMP72]], <2 x i64>* [[__V2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP73:%.*]] = load <2 x i64>, <2 x i64>* [[__V1_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP74:%.*]] = bitcast <2 x i64> [[TMP73]] to <4 x i32>
+// CHECK-NEXT: [[TMP75:%.*]] = load <2 x i64>, <2 x i64>* [[__V2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP76:%.*]] = bitcast <2 x i64> [[TMP75]] to <4 x i32>
+// CHECK-NEXT: [[TMP77:%.*]] = icmp ult <4 x i32> [[TMP74]], [[TMP76]]
+// CHECK-NEXT: [[TMP78:%.*]] = select <4 x i1> [[TMP77]], <4 x i32> [[TMP74]], <4 x i32> [[TMP76]]
+// CHECK-NEXT: [[TMP79:%.*]] = bitcast <4 x i32> [[TMP78]] to <2 x i64>
+// CHECK-NEXT: store <4 x i32> [[TMP78]], <4 x i32>* [[__T10_I]], align 16
+// CHECK-NEXT: [[TMP80:%.*]] = load <4 x i32>, <4 x i32>* [[__T10_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP80]], i32 0
+// CHECK-NEXT: ret i32 [[VECEXT_I]]
unsigned int test_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __W){
return _mm512_mask_reduce_min_epu32(__M, __W);
}
// CHECK-LABEL: define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) #0 {
-// CHECK: [[_COMPOUNDLITERAL_I_I18_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__A_ADDR_I19_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__B_ADDR_I20_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I15_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__A_ADDR_I16_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__B_ADDR_I17_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I12_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__A_ADDR_I13_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__B_ADDR_I14_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[_COMPOUNDLITERAL_I_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__A_ADDR_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__B_ADDR_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__W_ADDR_I_I:%.*]] = alloca float, align 4
-// CHECK: [[_COMPOUNDLITERAL_I_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__M_ADDR_I:%.*]] = alloca i16, align 2
-// CHECK: [[__V_ADDR_I:%.*]] = alloca <16 x float>, align 64
-// CHECK: [[__M_ADDR:%.*]] = alloca i16, align 2
-// CHECK: [[__W_ADDR:%.*]] = alloca <16 x float>, align 64
-// CHECK: store i16 %__M, i16* [[__M_ADDR]], align 2
-// CHECK: store <16 x float> %__W, <16 x float>* [[__W_ADDR]], align 64
-// CHECK: [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
-// CHECK: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[__W_ADDR]], align 64
-// CHECK: store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
-// CHECK: store <16 x float> [[TMP1]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP2:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
-// CHECK: [[TMP3:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: store float 0x7FF0000000000000, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[TMP4:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT_I_I:%.*]] = insertelement <16 x float> undef, float [[TMP4]], i32 0
-// CHECK: [[TMP5:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <16 x float> [[VECINIT_I_I]], float [[TMP5]], i32 1
-// CHECK: [[TMP6:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <16 x float> [[VECINIT1_I_I]], float [[TMP6]], i32 2
-// CHECK: [[TMP7:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <16 x float> [[VECINIT2_I_I]], float [[TMP7]], i32 3
-// CHECK: [[TMP8:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT4_I_I:%.*]] = insertelement <16 x float> [[VECINIT3_I_I]], float [[TMP8]], i32 4
-// CHECK: [[TMP9:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT5_I_I:%.*]] = insertelement <16 x float> [[VECINIT4_I_I]], float [[TMP9]], i32 5
-// CHECK: [[TMP10:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT6_I_I:%.*]] = insertelement <16 x float> [[VECINIT5_I_I]], float [[TMP10]], i32 6
-// CHECK: [[TMP11:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT7_I_I:%.*]] = insertelement <16 x float> [[VECINIT6_I_I]], float [[TMP11]], i32 7
-// CHECK: [[TMP12:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT8_I_I:%.*]] = insertelement <16 x float> [[VECINIT7_I_I]], float [[TMP12]], i32 8
-// CHECK: [[TMP13:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT9_I_I:%.*]] = insertelement <16 x float> [[VECINIT8_I_I]], float [[TMP13]], i32 9
-// CHECK: [[TMP14:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT10_I_I:%.*]] = insertelement <16 x float> [[VECINIT9_I_I]], float [[TMP14]], i32 10
-// CHECK: [[TMP15:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT11_I_I:%.*]] = insertelement <16 x float> [[VECINIT10_I_I]], float [[TMP15]], i32 11
-// CHECK: [[TMP16:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT12_I_I:%.*]] = insertelement <16 x float> [[VECINIT11_I_I]], float [[TMP16]], i32 12
-// CHECK: [[TMP17:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT13_I_I:%.*]] = insertelement <16 x float> [[VECINIT12_I_I]], float [[TMP17]], i32 13
-// CHECK: [[TMP18:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT14_I_I:%.*]] = insertelement <16 x float> [[VECINIT13_I_I]], float [[TMP18]], i32 14
-// CHECK: [[TMP19:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
-// CHECK: [[VECINIT15_I_I:%.*]] = insertelement <16 x float> [[VECINIT14_I_I]], float [[TMP19]], i32 15
-// CHECK: store <16 x float> [[VECINIT15_I_I]], <16 x float>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP20:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I]], align 64
-// CHECK: [[TMP21:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-// CHECK: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x float> [[TMP3]], <16 x float> [[TMP20]]
-// CHECK: store <16 x float> [[TMP22]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP23:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP24:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> [[TMP24]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP25:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP26:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE1_I:%.*]] = shufflevector <16 x float> [[TMP25]], <16 x float> [[TMP26]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <16 x float> [[SHUFFLE_I]], <16 x float>* [[__A_ADDR_I19_I]], align 64
-// CHECK: store <16 x float> [[SHUFFLE1_I]], <16 x float>* [[__B_ADDR_I20_I]], align 64
-// CHECK: [[TMP27:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I19_I]], align 64
-// CHECK: [[TMP28:%.*]] = load <16 x float>, <16 x float>* [[__B_ADDR_I20_I]], align 64
-// CHECK: store <16 x float> zeroinitializer, <16 x float>* [[_COMPOUNDLITERAL_I_I18_I]], align 64
-// CHECK: [[TMP29:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I18_I]], align 64
-// CHECK: [[TMP30:%.*]] = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> [[TMP27]], <16 x float> [[TMP28]], <16 x float> [[TMP29]], i16 -1, i32 4) #2
-// CHECK: store <16 x float> [[TMP30]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP31:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP32:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE3_I:%.*]] = shufflevector <16 x float> [[TMP31]], <16 x float> [[TMP32]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP33:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP34:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE4_I:%.*]] = shufflevector <16 x float> [[TMP33]], <16 x float> [[TMP34]], <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <16 x float> [[SHUFFLE3_I]], <16 x float>* [[__A_ADDR_I16_I]], align 64
-// CHECK: store <16 x float> [[SHUFFLE4_I]], <16 x float>* [[__B_ADDR_I17_I]], align 64
-// CHECK: [[TMP35:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I16_I]], align 64
-// CHECK: [[TMP36:%.*]] = load <16 x float>, <16 x float>* [[__B_ADDR_I17_I]], align 64
-// CHECK: store <16 x float> zeroinitializer, <16 x float>* [[_COMPOUNDLITERAL_I_I15_I]], align 64
-// CHECK: [[TMP37:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I15_I]], align 64
-// CHECK: [[TMP38:%.*]] = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> [[TMP35]], <16 x float> [[TMP36]], <16 x float> [[TMP37]], i16 -1, i32 4) #2
-// CHECK: store <16 x float> [[TMP38]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP39:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP40:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE6_I:%.*]] = shufflevector <16 x float> [[TMP39]], <16 x float> [[TMP40]], <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP41:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP42:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE7_I:%.*]] = shufflevector <16 x float> [[TMP41]], <16 x float> [[TMP42]], <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <16 x float> [[SHUFFLE6_I]], <16 x float>* [[__A_ADDR_I13_I]], align 64
-// CHECK: store <16 x float> [[SHUFFLE7_I]], <16 x float>* [[__B_ADDR_I14_I]], align 64
-// CHECK: [[TMP43:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I13_I]], align 64
-// CHECK: [[TMP44:%.*]] = load <16 x float>, <16 x float>* [[__B_ADDR_I14_I]], align 64
-// CHECK: store <16 x float> zeroinitializer, <16 x float>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP45:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I12_I]], align 64
-// CHECK: [[TMP46:%.*]] = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> [[TMP43]], <16 x float> [[TMP44]], <16 x float> [[TMP45]], i16 -1, i32 4) #2
-// CHECK: store <16 x float> [[TMP46]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP47:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP48:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE9_I:%.*]] = shufflevector <16 x float> [[TMP47]], <16 x float> [[TMP48]], <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: [[TMP49:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP50:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[SHUFFLE10_I:%.*]] = shufflevector <16 x float> [[TMP49]], <16 x float> [[TMP50]], <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-// CHECK: store <16 x float> [[SHUFFLE9_I]], <16 x float>* [[__A_ADDR_I_I]], align 64
-// CHECK: store <16 x float> [[SHUFFLE10_I]], <16 x float>* [[__B_ADDR_I_I]], align 64
-// CHECK: [[TMP51:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I_I]], align 64
-// CHECK: [[TMP52:%.*]] = load <16 x float>, <16 x float>* [[__B_ADDR_I_I]], align 64
-// CHECK: store <16 x float> zeroinitializer, <16 x float>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP53:%.*]] = load <16 x float>, <16 x float>* [[_COMPOUNDLITERAL_I_I_I]], align 64
-// CHECK: [[TMP54:%.*]] = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> [[TMP51]], <16 x float> [[TMP52]], <16 x float> [[TMP53]], i16 -1, i32 4) #2
-// CHECK: store <16 x float> [[TMP54]], <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[TMP55:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
-// CHECK: [[VECEXT_I:%.*]] = extractelement <16 x float> [[TMP55]], i32 0
-// CHECK: ret float [[VECEXT_I]]
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__W2_ADDR_I_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT: [[__U_ADDR_I_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__A_ADDR_I_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT: [[__A_ADDR_I16_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__B_ADDR_I17_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__A_ADDR_I14_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_ADDR_I15_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__A_ADDR_I12_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_ADDR_I13_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__A2_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__B_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__W_ADDR_I_I:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT: [[__M_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__V_ADDR_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT: [[__T1_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__T2_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__T3_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__T4_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T5_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T6_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T7_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T8_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T9_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__T10_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__M_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[__W_ADDR:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT: store i16 [[__M:%.*]], i16* [[__M_ADDR]], align 2
+// CHECK-NEXT: store <16 x float> [[__W:%.*]], <16 x float>* [[__W_ADDR]], align 64
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[__M_ADDR]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[__W_ADDR]], align 64
+// CHECK-NEXT: store i16 [[TMP0]], i16* [[__M_ADDR_I]], align 2
+// CHECK-NEXT: store <16 x float> [[TMP1]], <16 x float>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store float 0x7FF0000000000000, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <16 x float> [[VECINIT_I_I]], float [[TMP3]], i32 1
+// CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <16 x float> [[VECINIT1_I_I]], float [[TMP4]], i32 2
+// CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <16 x float> [[VECINIT2_I_I]], float [[TMP5]], i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT4_I_I:%.*]] = insertelement <16 x float> [[VECINIT3_I_I]], float [[TMP6]], i32 4
+// CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT5_I_I:%.*]] = insertelement <16 x float> [[VECINIT4_I_I]], float [[TMP7]], i32 5
+// CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT6_I_I:%.*]] = insertelement <16 x float> [[VECINIT5_I_I]], float [[TMP8]], i32 6
+// CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT7_I_I:%.*]] = insertelement <16 x float> [[VECINIT6_I_I]], float [[TMP9]], i32 7
+// CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT8_I_I:%.*]] = insertelement <16 x float> [[VECINIT7_I_I]], float [[TMP10]], i32 8
+// CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT9_I_I:%.*]] = insertelement <16 x float> [[VECINIT8_I_I]], float [[TMP11]], i32 9
+// CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT10_I_I:%.*]] = insertelement <16 x float> [[VECINIT9_I_I]], float [[TMP12]], i32 10
+// CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT11_I_I:%.*]] = insertelement <16 x float> [[VECINIT10_I_I]], float [[TMP13]], i32 11
+// CHECK-NEXT: [[TMP14:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT12_I_I:%.*]] = insertelement <16 x float> [[VECINIT11_I_I]], float [[TMP14]], i32 12
+// CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT13_I_I:%.*]] = insertelement <16 x float> [[VECINIT12_I_I]], float [[TMP15]], i32 13
+// CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT14_I_I:%.*]] = insertelement <16 x float> [[VECINIT13_I_I]], float [[TMP16]], i32 14
+// CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[__W_ADDR_I_I]], align 4
+// CHECK-NEXT: [[VECINIT15_I_I:%.*]] = insertelement <16 x float> [[VECINIT14_I_I]], float [[TMP17]], i32 15
+// CHECK-NEXT: store <16 x float> [[VECINIT15_I_I]], <16 x float>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP18:%.*]] = load <16 x float>, <16 x float>* [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT: [[TMP19:%.*]] = load i16, i16* [[__M_ADDR_I]], align 2
+// CHECK-NEXT: [[TMP20:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: store <16 x float> [[TMP18]], <16 x float>* [[__W2_ADDR_I_I]], align 64
+// CHECK-NEXT: store i16 [[TMP19]], i16* [[__U_ADDR_I_I]], align 2
+// CHECK-NEXT: store <16 x float> [[TMP20]], <16 x float>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP21:%.*]] = load i16, i16* [[__U_ADDR_I_I]], align 2
+// CHECK-NEXT: [[TMP22:%.*]] = load <16 x float>, <16 x float>* [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP23:%.*]] = load <16 x float>, <16 x float>* [[__W2_ADDR_I_I]], align 64
+// CHECK-NEXT: [[TMP24:%.*]] = bitcast i16 [[TMP21]] to <16 x i1>
+// CHECK-NEXT: [[TMP25:%.*]] = select <16 x i1> [[TMP24]], <16 x float> [[TMP22]], <16 x float> [[TMP23]]
+// CHECK-NEXT: store <16 x float> [[TMP25]], <16 x float>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP26:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x float> [[TMP26]] to <8 x double>
+// CHECK-NEXT: [[EXTRACT_I:%.*]] = shufflevector <8 x double> [[TMP27]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x double> [[EXTRACT_I]] to <8 x float>
+// CHECK-NEXT: store <8 x float> [[TMP28]], <8 x float>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP29:%.*]] = load <16 x float>, <16 x float>* [[__V_ADDR_I]], align 64
+// CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x float> [[TMP29]] to <8 x double>
+// CHECK-NEXT: [[EXTRACT4_I:%.*]] = shufflevector <8 x double> [[TMP30]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x double> [[EXTRACT4_I]] to <8 x float>
+// CHECK-NEXT: store <8 x float> [[TMP31]], <8 x float>* [[__T2_I]], align 32
+// CHECK-NEXT: [[TMP32:%.*]] = load <8 x float>, <8 x float>* [[__T1_I]], align 32
+// CHECK-NEXT: [[TMP33:%.*]] = load <8 x float>, <8 x float>* [[__T2_I]], align 32
+// CHECK-NEXT: store <8 x float> [[TMP32]], <8 x float>* [[__A_ADDR_I16_I]], align 32
+// CHECK-NEXT: store <8 x float> [[TMP33]], <8 x float>* [[__B_ADDR_I17_I]], align 32
+// CHECK-NEXT: [[TMP34:%.*]] = load <8 x float>, <8 x float>* [[__A_ADDR_I16_I]], align 32
+// CHECK-NEXT: [[TMP35:%.*]] = load <8 x float>, <8 x float>* [[__B_ADDR_I17_I]], align 32
+// CHECK-NEXT: [[TMP36:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[TMP34]], <8 x float> [[TMP35]]) #2
+// CHECK-NEXT: store <8 x float> [[TMP36]], <8 x float>* [[__T3_I]], align 32
+// CHECK-NEXT: [[TMP37:%.*]] = load <8 x float>, <8 x float>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT6_I:%.*]] = shufflevector <8 x float> [[TMP37]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: store <4 x float> [[EXTRACT6_I]], <4 x float>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP38:%.*]] = load <8 x float>, <8 x float>* [[__T3_I]], align 32
+// CHECK-NEXT: [[EXTRACT7_I:%.*]] = shufflevector <8 x float> [[TMP38]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: store <4 x float> [[EXTRACT7_I]], <4 x float>* [[__T5_I]], align 16
+// CHECK-NEXT: [[TMP39:%.*]] = load <4 x float>, <4 x float>* [[__T4_I]], align 16
+// CHECK-NEXT: [[TMP40:%.*]] = load <4 x float>, <4 x float>* [[__T5_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP39]], <4 x float>* [[__A_ADDR_I14_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP40]], <4 x float>* [[__B_ADDR_I15_I]], align 16
+// CHECK-NEXT: [[TMP41:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I14_I]], align 16
+// CHECK-NEXT: [[TMP42:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I15_I]], align 16
+// CHECK-NEXT: [[TMP43:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[TMP41]], <4 x float> [[TMP42]]) #2
+// CHECK-NEXT: store <4 x float> [[TMP43]], <4 x float>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP44:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP45:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP44]], <4 x float> [[TMP45]], <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+// CHECK-NEXT: store <4 x float> [[SHUFFLE_I]], <4 x float>* [[__T7_I]], align 16
+// CHECK-NEXT: [[TMP46:%.*]] = load <4 x float>, <4 x float>* [[__T6_I]], align 16
+// CHECK-NEXT: [[TMP47:%.*]] = load <4 x float>, <4 x float>* [[__T7_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP46]], <4 x float>* [[__A_ADDR_I12_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP47]], <4 x float>* [[__B_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP48:%.*]] = load <4 x float>, <4 x float>* [[__A_ADDR_I12_I]], align 16
+// CHECK-NEXT: [[TMP49:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I13_I]], align 16
+// CHECK-NEXT: [[TMP50:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[TMP48]], <4 x float> [[TMP49]]) #2
+// CHECK-NEXT: store <4 x float> [[TMP50]], <4 x float>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP51:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP52:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
+// CHECK-NEXT: [[SHUFFLE10_I:%.*]] = shufflevector <4 x float> [[TMP51]], <4 x float> [[TMP52]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: store <4 x float> [[SHUFFLE10_I]], <4 x float>* [[__T9_I]], align 16
+// CHECK-NEXT: [[TMP53:%.*]] = load <4 x float>, <4 x float>* [[__T8_I]], align 16
+// CHECK-NEXT: [[TMP54:%.*]] = load <4 x float>, <4 x float>* [[__T9_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP53]], <4 x float>* [[__A2_ADDR_I_I]], align 16
+// CHECK-NEXT: store <4 x float> [[TMP54]], <4 x float>* [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP55:%.*]] = load <4 x float>, <4 x float>* [[__A2_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP56:%.*]] = load <4 x float>, <4 x float>* [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT: [[TMP57:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[TMP55]], <4 x float> [[TMP56]]) #2
+// CHECK-NEXT: store <4 x float> [[TMP57]], <4 x float>* [[__T10_I]], align 16
+// CHECK-NEXT: [[TMP58:%.*]] = load <4 x float>, <4 x float>* [[__T10_I]], align 16
+// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x float> [[TMP58]], i32 0
+// CHECK-NEXT: ret float [[VECEXT_I]]
float test_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __W){
return _mm512_mask_reduce_min_ps(__M, __W);
}
diff --git a/test/CodeGen/avx512bw-builtins.c b/test/CodeGen/avx512bw-builtins.c
index 3160a6667c00..7d2e1fadf4ce 100644
--- a/test/CodeGen/avx512bw-builtins.c
+++ b/test/CodeGen/avx512bw-builtins.c
@@ -684,7 +684,7 @@ __m512i test_mm512_maskz_avg_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
// CHECK: add <64 x i16> %{{.*}}, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK: lshr <64 x i16> %{{.*}}, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK: trunc <64 x i16> %{{.*}} to <64 x i8>
- // CHECK: store <64 x i8> zeroinitializer
+ // CHECK: store <8 x i64> zeroinitializer
// CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
return _mm512_maskz_avg_epu8(__U,__A,__B);
}
@@ -720,7 +720,7 @@ __m512i test_mm512_maskz_avg_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
// CHECK: add <32 x i32> %{{.*}}, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
// CHECK: lshr <32 x i32> %{{.*}}, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
// CHECK: trunc <32 x i32> %{{.*}} to <32 x i16>
- // CHECK: store <32 x i16> zeroinitializer
+ // CHECK: store <8 x i64> zeroinitializer
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_avg_epu16(__U,__A,__B);
}
@@ -963,99 +963,112 @@ __m512i test_mm512_maskz_subs_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
}
__m512i test_mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask2_permutex2var_epi16
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.hi.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.hi.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_mask2_permutex2var_epi16(__A,__I,__U,__B);
}
__m512i test_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) {
// CHECK-LABEL: @test_mm512_permutex2var_epi16
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.hi.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.hi.512
return _mm512_permutex2var_epi16(__A,__I,__B);
}
__m512i test_mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_permutex2var_epi16
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.hi.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.hi.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_mask_permutex2var_epi16(__A,__U,__I,__B);
}
__m512i test_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_permutex2var_epi16
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.hi.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.hi.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_permutex2var_epi16(__U,__A,__I,__B);
}
__m512i test_mm512_mulhrs_epi16(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mulhrs_epi16
- // CHECK: @llvm.x86.avx512.mask.pmul.hr.sw.512
+ // CHECK: @llvm.x86.avx512.pmul.hr.sw.512
return _mm512_mulhrs_epi16(__A,__B);
}
__m512i test_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_mulhrs_epi16
- // CHECK: @llvm.x86.avx512.mask.pmul.hr.sw.512
+ // CHECK: @llvm.x86.avx512.pmul.hr.sw.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_mask_mulhrs_epi16(__W,__U,__A,__B);
}
__m512i test_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_mulhrs_epi16
- // CHECK: @llvm.x86.avx512.mask.pmul.hr.sw.512
+ // CHECK: @llvm.x86.avx512.pmul.hr.sw.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_mulhrs_epi16(__U,__A,__B);
}
__m512i test_mm512_mulhi_epi16(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mulhi_epi16
- // CHECK: @llvm.x86.avx512.mask.pmulh.w.512
+ // CHECK: @llvm.x86.avx512.pmulh.w.512
return _mm512_mulhi_epi16(__A,__B);
}
__m512i test_mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_mulhi_epi16
- // CHECK: @llvm.x86.avx512.mask.pmulh.w.512
+ // CHECK: @llvm.x86.avx512.pmulh.w.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_mask_mulhi_epi16(__W,__U,__A,__B);
}
__m512i test_mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_mulhi_epi16
- // CHECK: @llvm.x86.avx512.mask.pmulh.w.512
+ // CHECK: @llvm.x86.avx512.pmulh.w.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_mulhi_epi16(__U,__A,__B);
}
__m512i test_mm512_mulhi_epu16(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mulhi_epu16
- // CHECK: @llvm.x86.avx512.mask.pmulhu.w.512
+ // CHECK: @llvm.x86.avx512.pmulhu.w.512
return _mm512_mulhi_epu16(__A,__B);
}
__m512i test_mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_mulhi_epu16
- // CHECK: @llvm.x86.avx512.mask.pmulhu.w.512
+ // CHECK: @llvm.x86.avx512.pmulhu.w.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_mask_mulhi_epu16(__W,__U,__A,__B);
}
__m512i test_mm512_maskz_mulhi_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_mulhi_epu16
- // CHECK: @llvm.x86.avx512.mask.pmulhu.w.512
+ // CHECK: @llvm.x86.avx512.pmulhu.w.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_mulhi_epu16(__U,__A,__B);
}
__m512i test_mm512_maddubs_epi16(__m512i __X, __m512i __Y) {
// CHECK-LABEL: @test_mm512_maddubs_epi16
- // CHECK: @llvm.x86.avx512.mask.pmaddubs.w.512
+ // CHECK: @llvm.x86.avx512.pmaddubs.w.512
return _mm512_maddubs_epi16(__X,__Y);
}
__m512i test_mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X, __m512i __Y) {
// CHECK-LABEL: @test_mm512_mask_maddubs_epi16
- // CHECK: @llvm.x86.avx512.mask.pmaddubs.w.512
+ // CHECK: @llvm.x86.avx512.pmaddubs.w.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_mask_maddubs_epi16(__W,__U,__X,__Y);
}
__m512i test_mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) {
// CHECK-LABEL: @test_mm512_maskz_maddubs_epi16
- // CHECK: @llvm.x86.avx512.mask.pmaddubs.w.512
+ // CHECK: @llvm.x86.avx512.pmaddubs.w.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_maddubs_epi16(__U,__X,__Y);
}
__m512i test_mm512_madd_epi16(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_madd_epi16
- // CHECK: @llvm.x86.avx512.mask.pmaddw.d.512
+ // CHECK: @llvm.x86.avx512.pmaddw.d.512
return _mm512_madd_epi16(__A,__B);
}
__m512i test_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_madd_epi16
- // CHECK: @llvm.x86.avx512.mask.pmaddw.d.512
+ // CHECK: @llvm.x86.avx512.pmaddw.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_madd_epi16(__W,__U,__A,__B);
}
__m512i test_mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_madd_epi16
- // CHECK: @llvm.x86.avx512.mask.pmaddw.d.512
+ // CHECK: @llvm.x86.avx512.pmaddw.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_madd_epi16(__U,__A,__B);
}
@@ -1097,19 +1110,21 @@ __m256i test_mm512_maskz_cvtusepi16_epi8(__mmask32 __M, __m512i __A) {
__m256i test_mm512_cvtepi16_epi8(__m512i __A) {
// CHECK-LABEL: @test_mm512_cvtepi16_epi8
- // CHECK: @llvm.x86.avx512.mask.pmov.wb.512
+ // CHECK: trunc <32 x i16> %{{.*}} to <32 x i8>
return _mm512_cvtepi16_epi8(__A);
}
__m256i test_mm512_mask_cvtepi16_epi8(__m256i __O, __mmask32 __M, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_cvtepi16_epi8
- // CHECK: @llvm.x86.avx512.mask.pmov.wb.512
+ // CHECK: trunc <32 x i16> %{{.*}} to <32 x i8>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm512_mask_cvtepi16_epi8(__O, __M, __A);
}
__m256i test_mm512_maskz_cvtepi16_epi8(__mmask32 __M, __m512i __A) {
// CHECK-LABEL: @test_mm512_maskz_cvtepi16_epi8
- // CHECK: @llvm.x86.avx512.mask.pmov.wb.512
+ // CHECK: trunc <32 x i16> %{{.*}} to <32 x i8>
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm512_maskz_cvtepi16_epi8(__M, __A);
}
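// The unmasked conversion is just a lane-wise trunc from i16 to i8; the mask and maskz
// variants then blend the truncated vector with the passthrough (or zero) vector, which is
// why each masked test expects a trunc followed by a select.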
@@ -1235,40 +1250,40 @@ __m512i test_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) {
__m512i test_mm512_shufflehi_epi16(__m512i __A) {
// CHECK-LABEL: @test_mm512_shufflehi_epi16
- // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
+ // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
return _mm512_shufflehi_epi16(__A, 5);
}
__m512i test_mm512_mask_shufflehi_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_shufflehi_epi16
- // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
+ // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_mask_shufflehi_epi16(__W, __U, __A, 5);
}
__m512i test_mm512_maskz_shufflehi_epi16(__mmask32 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_maskz_shufflehi_epi16
- // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
+ // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_shufflehi_epi16(__U, __A, 5);
}
__m512i test_mm512_shufflelo_epi16(__m512i __A) {
// CHECK-LABEL: @test_mm512_shufflelo_epi16
- // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+ // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
return _mm512_shufflelo_epi16(__A, 5);
}
__m512i test_mm512_mask_shufflelo_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_shufflelo_epi16
- // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+ // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_mask_shufflelo_epi16(__W, __U, __A, 5);
}
__m512i test_mm512_maskz_shufflelo_epi16(__mmask32 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_maskz_shufflelo_epi16
- // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+ // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_shufflelo_epi16(__U, __A, 5);
}
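// For vpshufhw/vpshuflw only one source register contributes, so the second shufflevector
// operand is undef; with an immediate of 5 (0b0101) each 128-bit lane keeps its untouched
// half and permutes the other four words into the (1,1,0,0) pattern, relative to that half,
// seen in the index lists.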
@@ -1335,7 +1350,7 @@ __m512i test_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A) {
__m512i test_mm512_bslli_epi128(__m512i __A) {
// CHECK-LABEL: @test_mm512_bslli_epi128
- // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
+ // CHECK: shufflevector <64 x i8> zeroinitializer, <64 x i8> %{{.*}}, <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
return _mm512_bslli_epi128(__A, 5);
}
@@ -1461,7 +1476,7 @@ __m512i test_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A) {
__m512i test_mm512_bsrli_epi128(__m512i __A) {
// CHECK-LABEL: @test_mm512_bsrli_epi128
- // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
+ // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> zeroinitializer, <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
return _mm512_bsrli_epi128(__A, 5);
}
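// _mm512_bslli_epi128 / _mm512_bsrli_epi128 shift bytes within each 128-bit lane, so the IR
// is a shufflevector over the source and a zeroinitializer vector: the zero vector supplies
// the bytes shifted in at each lane boundary, and the indices pick the surviving source
// bytes (here for a shift count of 5).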
__m512i test_mm512_mask_mov_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
@@ -1628,23 +1643,22 @@ __m512i test_mm512_maskz_set1_epi8(__mmask64 __M, char __A) {
__mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
// CHECK-LABEL: @test_mm512_kunpackd
- // CHECK: bitcast <64 x i1> %{{.*}} to i64
- // CHECK: bitcast <64 x i1> %{{.*}} to i64
- // CHECK: and i64 %{{.*}}, 4294967295
- // CHECK: shl i64 %{{.*}}, 32
- // CHECK: or i64 %{{.*}}, %{{.*}}
- // CHECK: bitcast i64 %{{.*}} to <64 x i1>
+ // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+ // CHECK: [[RHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+ // CHECK: [[LHS2:%.*]] = shufflevector <64 x i1> [[LHS]], <64 x i1> [[LHS]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ // CHECK: [[RHS2:%.*]] = shufflevector <64 x i1> [[RHS]], <64 x i1> [[RHS]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ // CHECK: [[CONCAT:%.*]] = shufflevector <32 x i1> [[RHS2]], <32 x i1> [[LHS2]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+ // CHECK: bitcast <64 x i1> [[CONCAT]] to i64
return _mm512_mask_cmpneq_epu8_mask(_mm512_kunpackd(_mm512_cmpneq_epu8_mask(__B, __A),_mm512_cmpneq_epu8_mask(__C, __D)), __E, __F);
}
__mmask32 test_mm512_kunpackw(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
// CHECK-LABEL: @test_mm512_kunpackw
- // CHECK: bitcast <32 x i1> %{{.*}} to i32
- // CHECK: bitcast <32 x i1> %{{.*}} to i32
- // CHECK: and i32 %{{.*}}, 65535
- // CHECK: shl i32 %{{.*}}, 16
- // CHECK: or i32 %{{.*}}, %{{.*}}
- // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: [[RHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+ // CHECK: [[LHS2:%.*]] = shufflevector <32 x i1> [[LHS]], <32 x i1> [[LHS]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ // CHECK: [[RHS2:%.*]] = shufflevector <32 x i1> [[RHS]], <32 x i1> [[RHS]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ // CHECK: [[CONCAT:%.*]] = shufflevector <16 x i1> [[RHS2]], <16 x i1> [[LHS2]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
return _mm512_mask_cmpneq_epu16_mask(_mm512_kunpackw(_mm512_cmpneq_epu16_mask(__B, __A),_mm512_cmpneq_epu16_mask(__C, __D)), __E, __F);
}
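// kunpackw and kunpackd build a wider mask by concatenating the low 16 (resp. 32) bits of
// their two mask operands. In vector-of-i1 form that concatenation is exactly the pair of
// half-width shufflevectors followed by the full-width shufflevector matched above.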
@@ -1743,7 +1757,8 @@ __mmask32 test_mm512_mask_testn_epi16_mask(__mmask32 __U, __m512i __A, __m512i _
__mmask64 test_mm512_movepi8_mask(__m512i __A) {
// CHECK-LABEL: @test_mm512_movepi8_mask
- // CHECK: @llvm.x86.avx512.cvtb2mask.512
+ // CHECK: [[CMP:%.*]] = icmp slt <64 x i8> %{{.*}}, zeroinitializer
+ // CHECK: bitcast <64 x i1> [[CMP]] to i64
return _mm512_movepi8_mask(__A);
}
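// vpmovb2m collects the sign bit of every byte into a mask, so the expected generic IR is a
// signed compare against zero followed by a bitcast of the <64 x i1> result to i64.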
@@ -1763,40 +1778,40 @@ __m512i test_mm512_movm_epi16(__mmask32 __A) {
__m512i test_mm512_broadcastb_epi8(__m128i __A) {
// CHECK-LABEL: @test_mm512_broadcastb_epi8
- // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> zeroinitializer, <64 x i32> zeroinitializer
+ // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <64 x i32> zeroinitializer
return _mm512_broadcastb_epi8(__A);
}
__m512i test_mm512_mask_broadcastb_epi8(__m512i __O, __mmask64 __M, __m128i __A) {
// CHECK-LABEL: @test_mm512_mask_broadcastb_epi8
- // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> zeroinitializer, <64 x i32> zeroinitializer
+ // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <64 x i32> zeroinitializer
// CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
return _mm512_mask_broadcastb_epi8(__O, __M, __A);
}
__m512i test_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) {
// CHECK-LABEL: @test_mm512_maskz_broadcastb_epi8
- // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> zeroinitializer, <64 x i32> zeroinitializer
+ // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <64 x i32> zeroinitializer
// CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
return _mm512_maskz_broadcastb_epi8(__M, __A);
}
__m512i test_mm512_broadcastw_epi16(__m128i __A) {
// CHECK-LABEL: @test_mm512_broadcastw_epi16
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> zeroinitializer, <32 x i32> zeroinitializer
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <32 x i32> zeroinitializer
return _mm512_broadcastw_epi16(__A);
}
__m512i test_mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A) {
// CHECK-LABEL: @test_mm512_mask_broadcastw_epi16
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> zeroinitializer, <32 x i32> zeroinitializer
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <32 x i32> zeroinitializer
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_mask_broadcastw_epi16(__O, __M, __A);
}
__m512i test_mm512_maskz_broadcastw_epi16(__mmask32 __M, __m128i __A) {
// CHECK-LABEL: @test_mm512_maskz_broadcastw_epi16
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> zeroinitializer, <32 x i32> zeroinitializer
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <32 x i32> zeroinitializer
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_broadcastw_epi16(__M, __A);
}
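// A broadcast of the low element is a shufflevector whose index vector is all zeros; every
// index selects element 0 of the first source, so the second source operand is irrelevant
// and is matched with a plain %{{.*}} here.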
@@ -1878,19 +1893,21 @@ __m512i test_mm512_maskz_set1_epi16(__mmask32 __M, short __A) {
}
__m512i test_mm512_permutexvar_epi16(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_permutexvar_epi16
- // CHECK: @llvm.x86.avx512.mask.permvar.hi.512
+ // CHECK: @llvm.x86.avx512.permvar.hi.512
return _mm512_permutexvar_epi16(__A, __B);
}
__m512i test_mm512_maskz_permutexvar_epi16(__mmask32 __M, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_permutexvar_epi16
- // CHECK: @llvm.x86.avx512.mask.permvar.hi.512
+ // CHECK: @llvm.x86.avx512.permvar.hi.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_permutexvar_epi16(__M, __A, __B);
}
__m512i test_mm512_mask_permutexvar_epi16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_permutexvar_epi16
- // CHECK: @llvm.x86.avx512.mask.permvar.hi.512
+ // CHECK: @llvm.x86.avx512.permvar.hi.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_mask_permutexvar_epi16(__W, __M, __A, __B);
}
__m512i test_mm512_alignr_epi8(__m512i __A,__m512i __B){
@@ -1917,19 +1934,21 @@ __m512i test_mm512_maskz_alignr_epi8(__mmask64 __U, __m512i __A,__m512i __B){
__m512i test_mm512_mm_dbsad_epu8(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mm_dbsad_epu8
- // CHECK: @llvm.x86.avx512.mask.dbpsadbw.512
+ // CHECK: @llvm.x86.avx512.dbpsadbw.512
return _mm512_dbsad_epu8(__A, __B, 170);
}
__m512i test_mm512_mm_mask_dbsad_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mm_mask_dbsad_epu8
- // CHECK: @llvm.x86.avx512.mask.dbpsadbw.512
+ // CHECK: @llvm.x86.avx512.dbpsadbw.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_mask_dbsad_epu8(__W, __U, __A, __B, 170);
}
__m512i test_mm512_mm_maskz_dbsad_epu8(__mmask32 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mm_maskz_dbsad_epu8
- // CHECK: @llvm.x86.avx512.mask.dbpsadbw.512
+ // CHECK: @llvm.x86.avx512.dbpsadbw.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_dbsad_epu8(__U, __A, __B, 170);
}
@@ -1941,7 +1960,8 @@ __m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) {
__mmask32 test_mm512_movepi16_mask(__m512i __A) {
// CHECK-LABEL: @test_mm512_movepi16_mask
- // CHECK: @llvm.x86.avx512.cvtw2mask.512
+ // CHECK: [[CMP:%.*]] = icmp slt <32 x i16> %{{.*}}, zeroinitializer
+ // CHECK: bitcast <32 x i1> [[CMP]] to i32
return _mm512_movepi16_mask(__A);
}
diff --git a/test/CodeGen/avx512dq-builtins.c b/test/CodeGen/avx512dq-builtins.c
index 1b21ca3c4302..6ac3aff0aa3c 100644
--- a/test/CodeGen/avx512dq-builtins.c
+++ b/test/CodeGen/avx512dq-builtins.c
@@ -6,7 +6,7 @@
__m512i test_mm512_mullo_epi64 (__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mullo_epi64
// CHECK: mul <8 x i64>
- return (__m512i) ((__v8di) __A * (__v8di) __B);
+ return (__m512i) _mm512_mullo_epi64(__A, __B);
}
__m512i test_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
@@ -347,19 +347,21 @@ __m512i test_mm512_maskz_cvt_roundps_epu64(__mmask8 __U, __m256 __A) {
__m512d test_mm512_cvtepi64_pd(__m512i __A) {
// CHECK-LABEL: @test_mm512_cvtepi64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtqq2pd.512
+ // CHECK: sitofp <8 x i64> %{{.*}} to <8 x double>
return _mm512_cvtepi64_pd(__A);
}
__m512d test_mm512_mask_cvtepi64_pd(__m512d __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_cvtepi64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtqq2pd.512
+ // CHECK: sitofp <8 x i64> %{{.*}} to <8 x double>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_cvtepi64_pd(__W, __U, __A);
}
__m512d test_mm512_maskz_cvtepi64_pd(__mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_maskz_cvtepi64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtqq2pd.512
+ // CHECK: sitofp <8 x i64> %{{.*}} to <8 x double>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_maskz_cvtepi64_pd(__U, __A);
}
@@ -563,19 +565,21 @@ __m512i test_mm512_maskz_cvtt_roundps_epu64(__mmask8 __U, __m256 __A) {
__m512d test_mm512_cvtepu64_pd(__m512i __A) {
// CHECK-LABEL: @test_mm512_cvtepu64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtuqq2pd.512
+ // CHECK: uitofp <8 x i64> %{{.*}} to <8 x double>
return _mm512_cvtepu64_pd(__A);
}
__m512d test_mm512_mask_cvtepu64_pd(__m512d __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_cvtepu64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtuqq2pd.512
+ // CHECK: uitofp <8 x i64> %{{.*}} to <8 x double>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_cvtepu64_pd(__W, __U, __A);
}
__m512d test_mm512_maskz_cvtepu64_pd(__mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_maskz_cvtepu64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtuqq2pd.512
+ // CHECK: uitofp <8 x i64> %{{.*}} to <8 x double>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_maskz_cvtepu64_pd(__U, __A);
}
@@ -687,13 +691,13 @@ __m128d test_mm512_maskz_range_round_sd(__mmask8 __U, __m128d __A, __m128d __B)
return _mm_maskz_range_round_sd(__U, __A, __B, 4, 8);
}
-__m128d test_mm512_range_round_ss(__m128d __A, __m128d __B) {
+__m128 test_mm512_range_round_ss(__m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm512_range_round_ss
// CHECK: @llvm.x86.avx512.mask.range.ss
return _mm_range_round_ss(__A, __B, 4, 8);
}
-__m128d test_mm512_mask_range_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+__m128 test_mm512_mask_range_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm512_mask_range_round_ss
// CHECK: @llvm.x86.avx512.mask.range.ss
return _mm_mask_range_round_ss(__W, __U, __A, __B, 4, 8);
@@ -723,13 +727,13 @@ __m128d test_mm_maskz_range_sd(__mmask8 __U, __m128d __A, __m128d __B) {
return _mm_maskz_range_sd(__U, __A, __B, 4);
}
-__m128d test_mm_range_ss(__m128d __A, __m128d __B) {
+__m128 test_mm_range_ss(__m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm_range_ss
// CHECK: @llvm.x86.avx512.mask.range.ss
return _mm_range_ss(__A, __B, 4);
}
-__m128d test_mm_mask_range_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+__m128 test_mm_mask_range_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm_mask_range_ss
// CHECK: @llvm.x86.avx512.mask.range.ss
return _mm_mask_range_ss(__W, __U, __A, __B, 4);
@@ -923,7 +927,8 @@ __m128d test_mm_maskz_reduce_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
__mmask16 test_mm512_movepi32_mask(__m512i __A) {
// CHECK-LABEL: @test_mm512_movepi32_mask
- // CHECK: @llvm.x86.avx512.cvtd2mask.512
+ // CHECK: [[CMP:%.*]] = icmp slt <16 x i32> %{{.*}}, zeroinitializer
+ // CHECK: bitcast <16 x i1> [[CMP]] to i16
return _mm512_movepi32_mask(__A);
}
@@ -943,26 +948,27 @@ __m512i test_mm512_movm_epi64(__mmask8 __A) {
__mmask8 test_mm512_movepi64_mask(__m512i __A) {
// CHECK-LABEL: @test_mm512_movepi64_mask
- // CHECK: @llvm.x86.avx512.cvtq2mask.512
+ // CHECK: [[CMP:%.*]] = icmp slt <8 x i64> %{{.*}}, zeroinitializer
+ // CHECK: bitcast <8 x i1> [[CMP]] to i8
return _mm512_movepi64_mask(__A);
}
__m512 test_mm512_broadcast_f32x2(__m128 __A) {
// CHECK-LABEL: @test_mm512_broadcast_f32x2
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
return _mm512_broadcast_f32x2(__A);
}
__m512 test_mm512_mask_broadcast_f32x2(__m512 __O, __mmask16 __M, __m128 __A) {
// CHECK-LABEL: @test_mm512_mask_broadcast_f32x2
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_broadcast_f32x2(__O, __M, __A);
}
__m512 test_mm512_maskz_broadcast_f32x2(__mmask16 __M, __m128 __A) {
// CHECK-LABEL: @test_mm512_maskz_broadcast_f32x2
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_broadcast_f32x2(__M, __A);
}
@@ -1009,20 +1015,20 @@ __m512d test_mm512_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) {
__m512i test_mm512_broadcast_i32x2(__m128i __A) {
// CHECK-LABEL: @test_mm512_broadcast_i32x2
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
return _mm512_broadcast_i32x2(__A);
}
__m512i test_mm512_mask_broadcast_i32x2(__m512i __O, __mmask16 __M, __m128i __A) {
// CHECK-LABEL: @test_mm512_mask_broadcast_i32x2
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_broadcast_i32x2(__O, __M, __A);
}
__m512i test_mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) {
// CHECK-LABEL: @test_mm512_maskz_broadcast_i32x2
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_broadcast_i32x2(__M, __A);
}
@@ -1069,80 +1075,80 @@ __m512i test_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i const* __A) {
__m256 test_mm512_extractf32x8_ps(__m512 __A) {
// CHECK-LABEL: @test_mm512_extractf32x8_ps
- // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
return _mm512_extractf32x8_ps(__A, 1);
}
__m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) {
// CHECK-LABEL: @test_mm512_mask_extractf32x8_ps
- // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm512_mask_extractf32x8_ps(__W, __U, __A, 1);
}
__m256 test_mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A) {
// CHECK-LABEL: @test_mm512_maskz_extractf32x8_ps
- // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm512_maskz_extractf32x8_ps(__U, __A, 1);
}
__m128d test_mm512_extractf64x2_pd(__m512d __A) {
// CHECK-LABEL: @test_mm512_extractf64x2_pd
- // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <2 x i32> <i32 6, i32 7>
+ // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <2 x i32> <i32 6, i32 7>
return _mm512_extractf64x2_pd(__A, 3);
}
__m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) {
// CHECK-LABEL: @test_mm512_mask_extractf64x2_pd
- // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <2 x i32> <i32 6, i32 7>
+ // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <2 x i32> <i32 6, i32 7>
// CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm512_mask_extractf64x2_pd(__W, __U, __A, 3);
}
__m128d test_mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A) {
// CHECK-LABEL: @test_mm512_maskz_extractf64x2_pd
- // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <2 x i32> <i32 6, i32 7>
+ // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <2 x i32> <i32 6, i32 7>
// CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm512_maskz_extractf64x2_pd(__U, __A, 3);
}
__m256i test_mm512_extracti32x8_epi32(__m512i __A) {
// CHECK-LABEL: @test_mm512_extracti32x8_epi32
- // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
return _mm512_extracti32x8_epi32(__A, 1);
}
__m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_extracti32x8_epi32
- // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm512_mask_extracti32x8_epi32(__W, __U, __A, 1);
}
__m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_maskz_extracti32x8_epi32
- // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm512_maskz_extracti32x8_epi32(__U, __A, 1);
}
__m128i test_mm512_extracti64x2_epi64(__m512i __A) {
// CHECK-LABEL: @test_mm512_extracti64x2_epi64
- // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> zeroinitializer, <2 x i32> <i32 6, i32 7>
+ // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
return _mm512_extracti64x2_epi64(__A, 3);
}
__m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_extracti64x2_epi64
- // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> zeroinitializer, <2 x i32> <i32 6, i32 7>
+ // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
// CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm512_mask_extracti64x2_epi64(__W, __U, __A, 3);
}
__m128i test_mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_maskz_extracti64x2_epi64
- // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> zeroinitializer, <2 x i32> <i32 6, i32 7>
+ // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
// CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm512_maskz_extracti64x2_epi64(__U, __A, 3);
}
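// Extracting an upper 256-bit or 128-bit subvector is modeled as a shufflevector whose
// indices name the high elements of the source; only one source contributes, so the second
// operand is undef.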
@@ -1228,35 +1234,35 @@ __m512i test_mm512_maskz_inserti64x2(__mmask8 __U, __m512i __A, __m128i __B) {
}
__mmask8 test_mm512_mask_fpclass_pd_mask(__mmask8 __U, __m512d __A) {
// CHECK-LABEL: @test_mm512_mask_fpclass_pd_mask
- // CHECK: @llvm.x86.avx512.mask.fpclass.pd.512
+ // CHECK: @llvm.x86.avx512.fpclass.pd.512
return _mm512_mask_fpclass_pd_mask(__U, __A, 4);
}
__mmask8 test_mm512_fpclass_pd_mask(__m512d __A) {
// CHECK-LABEL: @test_mm512_fpclass_pd_mask
- // CHECK: @llvm.x86.avx512.mask.fpclass.pd.512
+ // CHECK: @llvm.x86.avx512.fpclass.pd.512
return _mm512_fpclass_pd_mask(__A, 4);
}
__mmask16 test_mm512_mask_fpclass_ps_mask(__mmask16 __U, __m512 __A) {
// CHECK-LABEL: @test_mm512_mask_fpclass_ps_mask
- // CHECK: @llvm.x86.avx512.mask.fpclass.ps.512
+ // CHECK: @llvm.x86.avx512.fpclass.ps.512
return _mm512_mask_fpclass_ps_mask(__U, __A, 4);
}
__mmask16 test_mm512_fpclass_ps_mask(__m512 __A) {
// CHECK-LABEL: @test_mm512_fpclass_ps_mask
- // CHECK: @llvm.x86.avx512.mask.fpclass.ps.512
+ // CHECK: @llvm.x86.avx512.fpclass.ps.512
return _mm512_fpclass_ps_mask(__A, 4);
}
-__mmask8 test_mm_fpclass_sd_mask(__m128 __A) {
+__mmask8 test_mm_fpclass_sd_mask(__m128d __A) {
// CHECK-LABEL: @test_mm_fpclass_sd_mask
// CHECK: @llvm.x86.avx512.mask.fpclass.sd
return _mm_fpclass_sd_mask (__A, 2);
}
-__mmask8 test_mm_mask_fpclass_sd_mask(__mmask8 __U, __m128 __A) {
+__mmask8 test_mm_mask_fpclass_sd_mask(__mmask8 __U, __m128d __A) {
// CHECK-LABEL: @test_mm_mask_fpclass_sd_mask
// CHECK: @llvm.x86.avx512.mask.fpclass.sd
return _mm_mask_fpclass_sd_mask (__U, __A, 2);
diff --git a/test/CodeGen/avx512f-builtins.c b/test/CodeGen/avx512f-builtins.c
index ce831d690ee7..17cbc62d6149 100644
--- a/test/CodeGen/avx512f-builtins.c
+++ b/test/CodeGen/avx512f-builtins.c
@@ -5,85 +5,101 @@
__m512d test_mm512_sqrt_pd(__m512d a)
{
// CHECK-LABEL: @test_mm512_sqrt_pd
- // CHECK: @llvm.x86.avx512.mask.sqrt.pd.512
+ // CHECK: call <8 x double> @llvm.sqrt.v8f64(<8 x double> %{{.*}})
return _mm512_sqrt_pd(a);
}
__m512d test_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
// CHECK-LABEL: @test_mm512_mask_sqrt_pd
- // CHECK: @llvm.x86.avx512.mask.sqrt.pd.512
+ // CHECK: call <8 x double> @llvm.sqrt.v8f64(<8 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_sqrt_pd (__W,__U,__A);
}
__m512d test_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
{
// CHECK-LABEL: @test_mm512_maskz_sqrt_pd
- // CHECK: @llvm.x86.avx512.mask.sqrt.pd.512
+ // CHECK: call <8 x double> @llvm.sqrt.v8f64(<8 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> {{.*}}
return _mm512_maskz_sqrt_pd (__U,__A);
}
__m512d test_mm512_mask_sqrt_round_pd(__m512d __W,__mmask8 __U,__m512d __A)
{
// CHECK-LABEL: @test_mm512_mask_sqrt_round_pd
- // CHECK: @llvm.x86.avx512.mask.sqrt.pd.512
- return _mm512_mask_sqrt_round_pd(__W,__U,__A,_MM_FROUND_CUR_DIRECTION);
+ // CHECK: call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %{{.*}}, i32 8)
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+ return _mm512_mask_sqrt_round_pd(__W,__U,__A,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_maskz_sqrt_round_pd(__mmask8 __U,__m512d __A)
{
// CHECK-LABEL: @test_mm512_maskz_sqrt_round_pd
- // CHECK: @llvm.x86.avx512.mask.sqrt.pd.512
- return _mm512_maskz_sqrt_round_pd(__U,__A,_MM_FROUND_CUR_DIRECTION);
+ // CHECK: call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %{{.*}}, i32 8)
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> {{.*}}
+ return _mm512_maskz_sqrt_round_pd(__U,__A,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_sqrt_round_pd(__m512d __A)
{
// CHECK-LABEL: @test_mm512_sqrt_round_pd
- // CHECK: @llvm.x86.avx512.mask.sqrt.pd.512
- return _mm512_sqrt_round_pd(__A,_MM_FROUND_CUR_DIRECTION);
+ // CHECK: call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %{{.*}}, i32 8)
+ return _mm512_sqrt_round_pd(__A,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
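// _MM_FROUND_TO_NEAREST_INT is 0 and _MM_FROUND_NO_EXC is 8, so the rounding argument in
// these *_round tests constant-folds to the literal i32 8 that the CHECK lines expect.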
__m512 test_mm512_sqrt_ps(__m512 a)
{
// CHECK-LABEL: @test_mm512_sqrt_ps
- // CHECK: @llvm.x86.avx512.mask.sqrt.ps.512
+ // CHECK: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.*}})
return _mm512_sqrt_ps(a);
}
__m512 test_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
{
// CHECK-LABEL: @test_mm512_mask_sqrt_ps
- // CHECK: @llvm.x86.avx512.mask.sqrt.ps.512
+ // CHECK: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_sqrt_ps( __W, __U, __A);
}
__m512 test_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
{
// CHECK-LABEL: @test_mm512_maskz_sqrt_ps
- // CHECK: @llvm.x86.avx512.mask.sqrt.ps.512
+ // CHECK: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> {{.*}}
return _mm512_maskz_sqrt_ps(__U ,__A);
}
__m512 test_mm512_mask_sqrt_round_ps(__m512 __W,__mmask16 __U,__m512 __A)
{
// CHECK-LABEL: @test_mm512_mask_sqrt_round_ps
- // CHECK: @llvm.x86.avx512.mask.sqrt.ps.512
- return _mm512_mask_sqrt_round_ps(__W,__U,__A,_MM_FROUND_CUR_DIRECTION);
+ // CHECK: call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %{{.*}}, i32 8)
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+ return _mm512_mask_sqrt_round_ps(__W,__U,__A,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_maskz_sqrt_round_ps(__mmask16 __U,__m512 __A)
{
// CHECK-LABEL: @test_mm512_maskz_sqrt_round_ps
- // CHECK: @llvm.x86.avx512.mask.sqrt.ps.512
- return _mm512_maskz_sqrt_round_ps(__U,__A,_MM_FROUND_CUR_DIRECTION);
+ // CHECK: call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %{{.*}}, i32 8)
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> {{.*}}
+ return _mm512_maskz_sqrt_round_ps(__U,__A,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_sqrt_round_ps(__m512 __A)
{
// CHECK-LABEL: @test_mm512_sqrt_round_ps
- // CHECK: @llvm.x86.avx512.mask.sqrt.ps.512
- return _mm512_sqrt_round_ps(__A,_MM_FROUND_CUR_DIRECTION);
+ // CHECK: call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %{{.*}}, i32 8)
+ return _mm512_sqrt_round_ps(__A,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_rsqrt14_pd(__m512d a)
@@ -159,7 +175,7 @@ __m512d test_mm512_mul_pd(__m512d a, __m512d b)
void test_mm512_storeu_si512 (void *__P, __m512i __A)
{
// CHECK-LABEL: @test_mm512_storeu_si512
- // CHECK: store <16 x i32> %{{.*}}, <16 x i32>* %{{.*}}, align 1{{$}}
+ // CHECK: store <8 x i64> %{{.*}}, <8 x i64>* %{{.*}}, align 1{{$}}
// CHECK-NEXT: ret void
_mm512_storeu_si512 ( __P,__A);
}
@@ -253,7 +269,7 @@ void test_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A) {
__m512i test_mm512_loadu_si512 (void *__P)
{
// CHECK-LABEL: @test_mm512_loadu_si512
- // CHECK: load <16 x i32>, <16 x i32>* %{{.*}}, align 1{{$}}
+ // CHECK: load <8 x i64>, <8 x i64>* %{{.*}}, align 1{{$}}
return _mm512_loadu_si512 ( __P);
}
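// __m512i is a vector of eight long long, so the unaligned whole-register load and store
// intrinsics are checked as <8 x i64> memory operations with align 1.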
@@ -439,483 +455,745 @@ __m512i test_mm512_maskz_alignr_epi64( __mmask8 u, __m512i a, __m512i b)
__m512d test_mm512_fmadd_round_pd(__m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_fmadd_round_pd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512
+ // CHECK: @llvm.x86.avx512.vfmadd.pd.512
return _mm512_fmadd_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask_fmadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_mask_fmadd_round_pd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512
+ // CHECK: @llvm.x86.avx512.vfmadd.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_fmadd_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask3_fmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm512_mask3_fmadd_round_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.512
+ // CHECK: @llvm.x86.avx512.vfmadd.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask3_fmadd_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_maskz_fmadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_maskz_fmadd_round_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512
+ // CHECK: @llvm.x86.avx512.vfmadd.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
return _mm512_maskz_fmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_fmsub_round_pd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+ // CHECK: @llvm.x86.avx512.vfmadd.pd.512
return _mm512_fmsub_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
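// fmsub(a, b, c) is fma(a, b, -c), and negating a double vector is written in IR as an fsub
// from a splat of -0.0. Each fmsub/fnmadd/fnmsub test therefore matches one or two of those
// fsub-by-negative-zero lines ahead of the vfmadd (or llvm.fma) call.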
__m512d test_mm512_mask_fmsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_mask_fmsub_round_pd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+ // CHECK: @llvm.x86.avx512.vfmadd.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_fmsub_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_maskz_fmsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_maskz_fmsub_round_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+ // CHECK: @llvm.x86.avx512.vfmadd.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
return _mm512_maskz_fmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_fnmadd_round_pd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+ // CHECK: @llvm.x86.avx512.vfmadd.pd.512
return _mm512_fnmadd_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask3_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm512_mask3_fnmadd_round_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+ // CHECK: @llvm.x86.avx512.vfmadd.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask3_fnmadd_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_maskz_fnmadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_maskz_fnmadd_round_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+ // CHECK: @llvm.x86.avx512.vfmadd.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
return _mm512_maskz_fnmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_fnmsub_round_pd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+ // CHECK: @llvm.x86.avx512.vfmadd.pd.512
return _mm512_fnmsub_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_maskz_fnmsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_maskz_fnmsub_round_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+ // CHECK: @llvm.x86.avx512.vfmadd.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
return _mm512_maskz_fnmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_fmadd_pd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512
+ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
return _mm512_fmadd_pd(__A, __B, __C);
}
__m512d test_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_mask_fmadd_pd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512
+ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_fmadd_pd(__A, __U, __B, __C);
}
__m512d test_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm512_mask3_fmadd_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.512
+ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask3_fmadd_pd(__A, __B, __C, __U);
}
__m512d test_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_maskz_fmadd_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512
+ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
return _mm512_maskz_fmadd_pd(__U, __A, __B, __C);
}
__m512d test_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_fmsub_pd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
return _mm512_fmsub_pd(__A, __B, __C);
}
__m512d test_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_mask_fmsub_pd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_fmsub_pd(__A, __U, __B, __C);
}
__m512d test_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_maskz_fmsub_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
return _mm512_maskz_fmsub_pd(__U, __A, __B, __C);
}
__m512d test_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_fnmadd_pd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
return _mm512_fnmadd_pd(__A, __B, __C);
}
__m512d test_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm512_mask3_fnmadd_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask3_fnmadd_pd(__A, __B, __C, __U);
}
__m512d test_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_maskz_fnmadd_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
return _mm512_maskz_fnmadd_pd(__U, __A, __B, __C);
}
__m512d test_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_fnmsub_pd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
return _mm512_fnmsub_pd(__A, __B, __C);
}
__m512d test_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_maskz_fnmsub_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
return _mm512_maskz_fnmsub_pd(__U, __A, __B, __C);
}
__m512 test_mm512_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_fmadd_round_ps
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512
+ // CHECK: @llvm.x86.avx512.vfmadd.ps.512
return _mm512_fmadd_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask_fmadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_mask_fmadd_round_ps
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512
+ // CHECK: @llvm.x86.avx512.vfmadd.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_fmadd_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask3_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
// CHECK-LABEL: @test_mm512_mask3_fmadd_round_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.512
+ // CHECK: @llvm.x86.avx512.vfmadd.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask3_fmadd_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_maskz_fmadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_maskz_fmadd_round_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512
+ // CHECK: @llvm.x86.avx512.vfmadd.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
return _mm512_maskz_fmadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_fmsub_round_ps
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmadd.ps.512
return _mm512_fmsub_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask_fmsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_mask_fmsub_round_ps
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmadd.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_fmsub_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_maskz_fmsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_maskz_fmsub_round_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmadd.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
return _mm512_maskz_fmsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_fnmadd_round_ps
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmadd.ps.512
return _mm512_fnmadd_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask3_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
// CHECK-LABEL: @test_mm512_mask3_fnmadd_round_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmadd.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask3_fnmadd_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_maskz_fnmadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_maskz_fnmadd_round_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmadd.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
return _mm512_maskz_fnmadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_fnmsub_round_ps
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmadd.ps.512
return _mm512_fnmsub_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_maskz_fnmsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_maskz_fnmsub_round_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmadd.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
return _mm512_maskz_fnmsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_fmadd_ps
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512
+ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
return _mm512_fmadd_ps(__A, __B, __C);
}
__m512 test_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_mask_fmadd_ps
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512
+ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
return _mm512_mask_fmadd_ps(__A, __U, __B, __C);
}
__m512 test_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
// CHECK-LABEL: @test_mm512_mask3_fmadd_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.512
+ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask3_fmadd_ps(__A, __B, __C, __U);
}
__m512 test_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_maskz_fmadd_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512
+ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
return _mm512_maskz_fmadd_ps(__U, __A, __B, __C);
}
__m512 test_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_fmsub_ps
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
return _mm512_fmsub_ps(__A, __B, __C);
}
__m512 test_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_mask_fmsub_ps
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_fmsub_ps(__A, __U, __B, __C);
}
__m512 test_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_maskz_fmsub_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
return _mm512_maskz_fmsub_ps(__U, __A, __B, __C);
}
__m512 test_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_fnmadd_ps
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
return _mm512_fnmadd_ps(__A, __B, __C);
}
__m512 test_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
// CHECK-LABEL: @test_mm512_mask3_fnmadd_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask3_fnmadd_ps(__A, __B, __C, __U);
}
__m512 test_mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_maskz_fnmadd_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
return _mm512_maskz_fnmadd_ps(__U, __A, __B, __C);
}
__m512 test_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_fnmsub_ps
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
return _mm512_fnmsub_ps(__A, __B, __C);
}
__m512 test_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_maskz_fnmsub_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
return _mm512_maskz_fnmsub_ps(__U, __A, __B, __C);
}
__m512d test_mm512_fmaddsub_round_pd(__m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_fmaddsub_round_pd
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512
+ // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
return _mm512_fmaddsub_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask_fmaddsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_mask_fmaddsub_round_pd
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512
+ // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_fmaddsub_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask3_fmaddsub_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm512_mask3_fmaddsub_round_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.pd.512
+ // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask3_fmaddsub_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_maskz_fmaddsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_maskz_fmaddsub_round_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.512
+ // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
return _mm512_maskz_fmaddsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_fmsubadd_round_pd(__m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_fmsubadd_round_pd
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
return _mm512_fmsubadd_round_pd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask_fmsubadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_mask_fmsubadd_round_pd
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_fmsubadd_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_maskz_fmsubadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_maskz_fmsubadd_round_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
return _mm512_maskz_fmsubadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_fmaddsub_pd
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512
+ // CHECK: [[ADD:%.+]] = call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> [[NEG]]
+ // CHECK: shufflevector <8 x double> [[SUB]], <8 x double> [[ADD]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
return _mm512_fmaddsub_pd(__A, __B, __C);
}
__m512d test_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_mask_fmaddsub_pd
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512
+ // CHECK: [[ADD:%.+]] = call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> [[NEG]]
+ // CHECK: shufflevector <8 x double> [[SUB]], <8 x double> [[ADD]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_fmaddsub_pd(__A, __U, __B, __C);
}
__m512d test_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm512_mask3_fmaddsub_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.pd.512
+ // CHECK: [[ADD:%.+]] = call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> [[NEG]]
+ // CHECK: shufflevector <8 x double> [[SUB]], <8 x double> [[ADD]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask3_fmaddsub_pd(__A, __B, __C, __U);
}
__m512d test_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_maskz_fmaddsub_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.512
+ // CHECK: [[ADD:%.+]] = call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> [[NEG]]
+ // CHECK: shufflevector <8 x double> [[SUB]], <8 x double> [[ADD]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
return _mm512_maskz_fmaddsub_pd(__U, __A, __B, __C);
}
__m512d test_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_fmsubadd_pd
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512
+ // CHECK: [[NEG:%.+]] = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: shufflevector <8 x double> [[ADD]], <8 x double> [[SUB]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
return _mm512_fmsubadd_pd(__A, __B, __C);
}
__m512d test_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_mask_fmsubadd_pd
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.512
+ // CHECK: [[NEG:%.+]] = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: shufflevector <8 x double> [[ADD]], <8 x double> [[SUB]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_fmsubadd_pd(__A, __U, __B, __C);
}
__m512d test_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_maskz_fmsubadd_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.512
+ // CHECK: [[NEG:%.+]] = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: shufflevector <8 x double> [[ADD]], <8 x double> [[SUB]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
return _mm512_maskz_fmsubadd_pd(__U, __A, __B, __C);
}
__m512 test_mm512_fmaddsub_round_ps(__m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_fmaddsub_round_ps
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512
+ // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
return _mm512_fmaddsub_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask_fmaddsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_mask_fmaddsub_round_ps
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512
+ // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_fmaddsub_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask3_fmaddsub_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
// CHECK-LABEL: @test_mm512_mask3_fmaddsub_round_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.ps.512
+ // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask3_fmaddsub_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_maskz_fmaddsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_maskz_fmaddsub_round_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.512
+ // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
return _mm512_maskz_fmaddsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_fmsubadd_round_ps(__m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_fmsubadd_round_ps
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
return _mm512_fmsubadd_round_ps(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask_fmsubadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_mask_fmsubadd_round_ps
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_fmsubadd_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_maskz_fmsubadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_maskz_fmsubadd_round_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
return _mm512_maskz_fmsubadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_fmaddsub_ps
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512
+ // CHECK: [[ADD:%.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> [[NEG]]
+ // CHECK: shufflevector <16 x float> [[SUB]], <16 x float> [[ADD]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
return _mm512_fmaddsub_ps(__A, __B, __C);
}
__m512 test_mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_mask_fmaddsub_ps
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512
+ // CHECK: [[ADD:%.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> [[NEG]]
+ // CHECK: shufflevector <16 x float> [[SUB]], <16 x float> [[ADD]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_fmaddsub_ps(__A, __U, __B, __C);
}
__m512 test_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
// CHECK-LABEL: @test_mm512_mask3_fmaddsub_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.ps.512
+ // CHECK: [[ADD:%.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> [[NEG]]
+ // CHECK: shufflevector <16 x float> [[SUB]], <16 x float> [[ADD]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask3_fmaddsub_ps(__A, __B, __C, __U);
}
__m512 test_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_maskz_fmaddsub_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.512
+ // CHECK: [[ADD:%.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> [[NEG]]
+ // CHECK: shufflevector <16 x float> [[SUB]], <16 x float> [[ADD]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
return _mm512_maskz_fmaddsub_ps(__U, __A, __B, __C);
}
__m512 test_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_fmsubadd_ps
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512
+ // CHECK: [[NEG:%.+]] = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: shufflevector <16 x float> [[ADD]], <16 x float> [[SUB]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
return _mm512_fmsubadd_ps(__A, __B, __C);
}
__m512 test_mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_mask_fmsubadd_ps
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.512
+ // CHECK: [[NEG:%.+]] = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: shufflevector <16 x float> [[ADD]], <16 x float> [[SUB]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_fmsubadd_ps(__A, __U, __B, __C);
}
__m512 test_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_maskz_fmsubadd_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.512
+ // CHECK: [[NEG:%.+]] = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: shufflevector <16 x float> [[ADD]], <16 x float> [[SUB]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
return _mm512_maskz_fmsubadd_ps(__U, __A, __B, __C);
}
__m512d test_mm512_mask3_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm512_mask3_fmsub_round_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmsub.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+ // CHECK: @llvm.x86.avx512.vfmadd.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask3_fmsub_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm512_mask3_fmsub_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmsub.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask3_fmsub_pd(__A, __B, __C, __U);
}
__m512 test_mm512_mask3_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
// CHECK-LABEL: @test_mm512_mask3_fmsub_round_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmsub.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmadd.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask3_fmsub_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
// CHECK-LABEL: @test_mm512_mask3_fmsub_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmsub.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask3_fmsub_ps(__A, __B, __C, __U);
}
__m512d test_mm512_mask3_fmsubadd_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm512_mask3_fmsubadd_round_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask3_fmsubadd_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm512_mask3_fmsubadd_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.pd.512
+ // CHECK: [[NEG:%.+]] = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: shufflevector <8 x double> [[ADD]], <8 x double> [[SUB]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask3_fmsubadd_pd(__A, __B, __C, __U);
}
__m512 test_mm512_mask3_fmsubadd_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
// CHECK-LABEL: @test_mm512_mask3_fmsubadd_round_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask3_fmsubadd_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
// CHECK-LABEL: @test_mm512_mask3_fmsubadd_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.ps.512
+ // CHECK: [[NEG:%.+]] = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: shufflevector <16 x float> [[ADD]], <16 x float> [[SUB]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask3_fmsubadd_ps(__A, __B, __C, __U);
}
__m512d test_mm512_mask_fnmadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_mask_fnmadd_round_pd
- // CHECK: @llvm.x86.avx512.mask.vfnmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+ // CHECK: @llvm.x86.avx512.vfmadd.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_fnmadd_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_mask_fnmadd_pd
- // CHECK: @llvm.x86.avx512.mask.vfnmadd.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_fnmadd_pd(__A, __U, __B, __C);
}
__m512 test_mm512_mask_fnmadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_mask_fnmadd_round_ps
- // CHECK: @llvm.x86.avx512.mask.vfnmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmadd.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_fnmadd_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_mask_fnmadd_ps
- // CHECK: @llvm.x86.avx512.mask.vfnmadd.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_fnmadd_ps(__A, __U, __B, __C);
}
__m512d test_mm512_mask_fnmsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_mask_fnmsub_round_pd
- // CHECK: @llvm.x86.avx512.mask.vfnmsub.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+ // CHECK: @llvm.x86.avx512.vfmadd.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_fnmsub_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask3_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm512_mask3_fnmsub_round_pd
- // CHECK: @llvm.x86.avx512.mask3.vfnmsub.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+ // CHECK: @llvm.x86.avx512.vfmadd.pd.512
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask3_fnmsub_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
// CHECK-LABEL: @test_mm512_mask_fnmsub_pd
- // CHECK: @llvm.x86.avx512.mask.vfnmsub.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_fnmsub_pd(__A, __U, __B, __C);
}
__m512d test_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm512_mask3_fnmsub_pd
- // CHECK: @llvm.x86.avx512.mask3.vfnmsub.pd.512
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask3_fnmsub_pd(__A, __B, __C, __U);
}
__m512 test_mm512_mask_fnmsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_mask_fnmsub_round_ps
- // CHECK: @llvm.x86.avx512.mask.vfnmsub.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmadd.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_fnmsub_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask3_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
// CHECK-LABEL: @test_mm512_mask3_fnmsub_round_ps
- // CHECK: @llvm.x86.avx512.mask3.vfnmsub.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: @llvm.x86.avx512.vfmadd.ps.512
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask3_fnmsub_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
// CHECK-LABEL: @test_mm512_mask_fnmsub_ps
- // CHECK: @llvm.x86.avx512.mask.vfnmsub.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_fnmsub_ps(__A, __U, __B, __C);
}
__m512 test_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
// CHECK-LABEL: @test_mm512_mask3_fnmsub_ps
- // CHECK: @llvm.x86.avx512.mask3.vfnmsub.ps.512
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
+ // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask3_fnmsub_ps(__A, __B, __C, __U);
}
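// NOTE (illustrative sketch, an assumption about the expected lowering; not part of the
// original test): vfnmsub computes -(a*b) - c.  The CHECK lines above expect that to be
// spelled as two negations via "fsub <-0.0 ...>, x" (one on a multiplicand, one on the
// addend) feeding a plain fma, i.e. roughly fma(-a, b, -c) == -(a*b) - c, with the
// masked variants then selecting between that result and the passthrough lanes.
// A scalar analogue of one lane: float fnmsub1(float a, float b, float c) { return (-a) * b + (-c); }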
@@ -1001,261 +1279,1094 @@ __m512 test_mm512_unpacklo_ps(__m512 a, __m512 b)
__mmask16 test_mm512_cmp_round_ps_mask(__m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_cmp_round_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: fcmp oeq <16 x float> %{{.*}}, %{{.*}}
return _mm512_cmp_round_ps_mask(a, b, 0, _MM_FROUND_CUR_DIRECTION);
}
__mmask16 test_mm512_mask_cmp_round_ps_mask(__mmask16 m, __m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_mask_cmp_round_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: [[CMP:%.*]] = fcmp oeq <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmp_round_ps_mask(m, a, b, 0, _MM_FROUND_CUR_DIRECTION);
}
-__mmask16 test_mm512_cmp_ps_mask(__m512 a, __m512 b) {
- // CHECK-LABEL: @test_mm512_cmp_ps_mask
- // CHECKn: @llvm.x86.avx512.mask.cmp.ps.512
- return _mm512_cmp_ps_mask(a, b, 0);
+__mmask16 test_mm512_cmp_ps_mask_eq_oq(__m512 a, __m512 b) {
+ // CHECK-LABEL: @test_mm512_cmp_ps_mask_eq_oq
+ // CHECK: fcmp oeq <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_lt_os(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_lt_os
+ // CHECK: fcmp olt <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_LT_OS);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_le_os(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_le_os
+ // CHECK: fcmp ole <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_LE_OS);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_unord_q(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_unord_q
+ // CHECK: fcmp uno <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_neq_uq(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_neq_uq
+ // CHECK: fcmp une <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_NEQ_UQ);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_nlt_us(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_nlt_us
+ // CHECK: fcmp uge <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_NLT_US);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_nle_us(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_nle_us
+ // CHECK: fcmp ugt <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_NLE_US);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_ord_q(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_ord_q
+ // CHECK: fcmp ord <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_ORD_Q);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_eq_uq(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_eq_uq
+ // CHECK: fcmp ueq <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_nge_us(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_nge_us
+ // CHECK: fcmp ult <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_NGE_US);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_ngt_us(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_ngt_us
+ // CHECK: fcmp ule <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_NGT_US);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_false_oq(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_false_oq
+ // CHECK: fcmp false <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_FALSE_OQ);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_neq_oq(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_neq_oq
+ // CHECK: fcmp one <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_NEQ_OQ);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_ge_os(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_ge_os
+ // CHECK: fcmp oge <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_GE_OS);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_gt_os(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_gt_os
+ // CHECK: fcmp ogt <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_GT_OS);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_true_uq(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_true_uq
+ // CHECK: fcmp true <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_TRUE_UQ);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_eq_os(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_eq_os
+ // CHECK: fcmp oeq <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_EQ_OS);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_lt_oq(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_lt_oq
+ // CHECK: fcmp olt <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_le_oq(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_le_oq
+ // CHECK: fcmp ole <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
}
-__mmask16 test_mm512_mask_cmp_ps_mask(__mmask16 m, __m512 a, __m512 b) {
- // CHECK-LABEL: @test_mm512_mask_cmp_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
- return _mm512_mask_cmp_ps_mask(m, a, b, 0);
+__mmask16 test_mm512_cmp_ps_mask_unord_s(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_unord_s
+ // CHECK: fcmp uno <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_UNORD_S);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_neq_us(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_neq_us
+ // CHECK: fcmp une <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_NEQ_US);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_nlt_uq(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_nlt_uq
+ // CHECK: fcmp uge <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_NLT_UQ);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_nle_uq(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_nle_uq
+ // CHECK: fcmp ugt <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_NLE_UQ);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_ord_s(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_ord_s
+ // CHECK: fcmp ord <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_ORD_S);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_eq_us(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_eq_us
+ // CHECK: fcmp ueq <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_EQ_US);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_nge_uq(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_nge_uq
+ // CHECK: fcmp ult <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_ngt_uq(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_ngt_uq
+ // CHECK: fcmp ule <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_false_os(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_false_os
+ // CHECK: fcmp false <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_FALSE_OS);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_neq_os(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_neq_os
+ // CHECK: fcmp one <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_NEQ_OS);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_ge_oq(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_ge_oq
+ // CHECK: fcmp oge <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_gt_oq(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_gt_oq
+ // CHECK: fcmp ogt <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ);
+}
+
+__mmask16 test_mm512_cmp_ps_mask_true_us(__m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_cmp_ps_mask_true_us
+ // CHECK: fcmp true <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmp_ps_mask(a, b, _CMP_TRUE_US);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_eq_oq(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: @test_mm512_mask_cmp_ps_mask_eq_oq
+ // CHECK: [[CMP:%.*]] = fcmp oeq <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_EQ_OQ);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_lt_os(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_lt_os
+ // CHECK: [[CMP:%.*]] = fcmp olt <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LT_OS);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_le_os(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_le_os
+ // CHECK: [[CMP:%.*]] = fcmp ole <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LE_OS);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_unord_q(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_unord_q
+ // CHECK: [[CMP:%.*]] = fcmp uno <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_UNORD_Q);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_neq_uq(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_neq_uq
+ // CHECK: [[CMP:%.*]] = fcmp une <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_UQ);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_nlt_us(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_nlt_us
+ // CHECK: [[CMP:%.*]] = fcmp uge <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NLT_US);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_nle_us(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_nle_us
+ // CHECK: [[CMP:%.*]] = fcmp ugt <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NLE_US);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_ord_q(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_ord_q
+ // CHECK: [[CMP:%.*]] = fcmp ord <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_ORD_Q);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_eq_uq(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_eq_uq
+ // CHECK: [[CMP:%.*]] = fcmp ueq <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_EQ_UQ);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_nge_us(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_nge_us
+ // CHECK: [[CMP:%.*]] = fcmp ult <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NGE_US);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_ngt_us(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_ngt_us
+ // CHECK: [[CMP:%.*]] = fcmp ule <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NGT_US);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_false_oq(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_false_oq
+ // CHECK: [[CMP:%.*]] = fcmp false <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_FALSE_OQ);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_neq_oq(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_neq_oq
+ // CHECK: [[CMP:%.*]] = fcmp one <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_OQ);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_ge_os(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_ge_os
+ // CHECK: [[CMP:%.*]] = fcmp oge <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_GE_OS);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_gt_os(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_gt_os
+ // CHECK: [[CMP:%.*]] = fcmp ogt <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_GT_OS);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_true_uq(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_true_uq
+ // CHECK: [[CMP:%.*]] = fcmp true <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_TRUE_UQ);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_eq_os(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_eq_os
+ // CHECK: [[CMP:%.*]] = fcmp oeq <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_EQ_OS);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_lt_oq(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_lt_oq
+ // CHECK: [[CMP:%.*]] = fcmp olt <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LT_OQ);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_le_oq(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_le_oq
+ // CHECK: [[CMP:%.*]] = fcmp ole <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LE_OQ);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_unord_s(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_unord_s
+ // CHECK: [[CMP:%.*]] = fcmp uno <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_UNORD_S);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_neq_us(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_neq_us
+ // CHECK: [[CMP:%.*]] = fcmp une <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_US);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_nlt_uq(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_nlt_uq
+ // CHECK: [[CMP:%.*]] = fcmp uge <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NLT_UQ);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_nle_uq(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_nle_uq
+ // CHECK: [[CMP:%.*]] = fcmp ugt <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NLE_UQ);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_ord_s(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_ord_s
+ // CHECK: [[CMP:%.*]] = fcmp ord <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_ORD_S);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_eq_us(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_eq_us
+ // CHECK: [[CMP:%.*]] = fcmp ueq <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_EQ_US);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_nge_uq(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_nge_uq
+ // CHECK: [[CMP:%.*]] = fcmp ult <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NGE_UQ);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_ngt_uq(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_ngt_uq
+ // CHECK: [[CMP:%.*]] = fcmp ule <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NGT_UQ);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_false_os(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_false_os
+ // CHECK: [[CMP:%.*]] = fcmp false <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_FALSE_OS);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_neq_os(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_neq_os
+ // CHECK: [[CMP:%.*]] = fcmp one <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_OS);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_ge_oq(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_ge_oq
+ // CHECK: [[CMP:%.*]] = fcmp oge <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_GE_OQ);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_gt_oq(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_gt_oq
+ // CHECK: [[CMP:%.*]] = fcmp ogt <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_GT_OQ);
+}
+
+__mmask16 test_mm512_mask_cmp_ps_mask_true_us(__mmask16 m, __m512 a, __m512 b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_true_us
+ // CHECK: [[CMP:%.*]] = fcmp true <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_TRUE_US);
}
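// NOTE (an assumption about the mapping; not part of the original test): once the
// x86 mask.cmp intrinsics are dropped, each _CMP_* immediate is checked as the
// corresponding generic fcmp predicate -- ordered forms (_CMP_*_OQ/_OS) use the o*
// predicates and unordered forms (_CMP_*_UQ/_US) use the u* predicates, e.g.
// _CMP_NLT_US ("not less than", true when either input is NaN) becomes "fcmp uge"
// and _CMP_NLE_US becomes "fcmp ugt".  The signaling vs. quiet (S vs. Q) distinction
// is not visible in this generic IR, which is why _CMP_EQ_OQ and _CMP_EQ_OS both
// expect "fcmp oeq".  The _pd_ variants and the named cmpeq/cmple/cmplt/... helpers
// below follow the same mapping on <8 x double> and <16 x float>.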
__mmask8 test_mm512_cmp_round_pd_mask(__m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_cmp_round_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: fcmp oeq <8 x double> %{{.*}}, %{{.*}}
return _mm512_cmp_round_pd_mask(a, b, 0, _MM_FROUND_CUR_DIRECTION);
}
__mmask8 test_mm512_mask_cmp_round_pd_mask(__mmask8 m, __m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_mask_cmp_round_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: [[CMP:%.*]] = fcmp oeq <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmp_round_pd_mask(m, a, b, 0, _MM_FROUND_CUR_DIRECTION);
}
-__mmask8 test_mm512_cmp_pd_mask(__m512d a, __m512d b) {
- // CHECK-LABEL: @test_mm512_cmp_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
- return _mm512_cmp_pd_mask(a, b, 0);
+__mmask8 test_mm512_cmp_pd_mask_eq_oq(__m512d a, __m512d b) {
+ // CHECK-LABEL: @test_mm512_cmp_pd_mask_eq_oq
+ // CHECK: fcmp oeq <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_lt_os(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_lt_os
+ // CHECK: fcmp olt <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_LT_OS);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_le_os(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_le_os
+ // CHECK: fcmp ole <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_LE_OS);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_unord_q(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_unord_q
+ // CHECK: fcmp uno <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_neq_uq(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_neq_uq
+ // CHECK: fcmp une <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_NEQ_UQ);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_nlt_us(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_nlt_us
+ // CHECK: fcmp uge <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_NLT_US);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_nle_us(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_nle_us
+ // CHECK: fcmp ugt <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_NLE_US);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_ord_q(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_ord_q
+ // CHECK: fcmp ord <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_ORD_Q);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_eq_uq(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_eq_uq
+ // CHECK: fcmp ueq <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_EQ_UQ);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_nge_us(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_nge_us
+ // CHECK: fcmp ult <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_NGE_US);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_ngt_us(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_ngt_us
+ // CHECK: fcmp ule <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_NGT_US);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_false_oq(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_false_oq
+ // CHECK: fcmp false <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_FALSE_OQ);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_neq_oq(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_neq_oq
+ // CHECK: fcmp one <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_NEQ_OQ);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_ge_os(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_ge_os
+ // CHECK: fcmp oge <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_GE_OS);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_gt_os(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_gt_os
+ // CHECK: fcmp ogt <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_GT_OS);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_true_uq(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_true_uq
+ // CHECK: fcmp true <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_TRUE_UQ);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_eq_os(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_eq_os
+ // CHECK: fcmp oeq <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_EQ_OS);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_lt_oq(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_lt_oq
+ // CHECK: fcmp olt <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_le_oq(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_le_oq
+ // CHECK: fcmp ole <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_unord_s(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_unord_s
+ // CHECK: fcmp uno <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_UNORD_S);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_neq_us(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_neq_us
+ // CHECK: fcmp une <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_NEQ_US);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_nlt_uq(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_nlt_uq
+ // CHECK: fcmp uge <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_NLT_UQ);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_nle_uq(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_nle_uq
+ // CHECK: fcmp ugt <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_NLE_UQ);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_ord_s(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_ord_s
+ // CHECK: fcmp ord <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_ORD_S);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_eq_us(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_eq_us
+ // CHECK: fcmp ueq <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_EQ_US);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_nge_uq(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_nge_uq
+ // CHECK: fcmp ult <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_ngt_uq(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_ngt_uq
+ // CHECK: fcmp ule <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_NGT_UQ);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_false_os(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_false_os
+ // CHECK: fcmp false <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_FALSE_OS);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_neq_os(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_neq_os
+ // CHECK: fcmp one <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_NEQ_OS);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_ge_oq(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_ge_oq
+ // CHECK: fcmp oge <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_GE_OQ);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_gt_oq(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_gt_oq
+ // CHECK: fcmp ogt <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_GT_OQ);
+}
+
+__mmask8 test_mm512_cmp_pd_mask_true_us(__m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_cmp_pd_mask_true_us
+ // CHECK: fcmp true <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmp_pd_mask(a, b, _CMP_TRUE_US);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_eq_oq(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: @test_mm512_mask_cmp_pd_mask_eq_oq
+ // CHECK: [[CMP:%.*]] = fcmp oeq <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_EQ_OQ);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_lt_os(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_lt_os
+ // CHECK: [[CMP:%.*]] = fcmp olt <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LT_OS);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_le_os(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_le_os
+ // CHECK: [[CMP:%.*]] = fcmp ole <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LE_OS);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_unord_q(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_unord_q
+ // CHECK: [[CMP:%.*]] = fcmp uno <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_UNORD_Q);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_neq_uq(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_neq_uq
+ // CHECK: [[CMP:%.*]] = fcmp une <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_UQ);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_nlt_us(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_nlt_us
+ // CHECK: [[CMP:%.*]] = fcmp uge <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NLT_US);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_nle_us(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_nle_us
+ // CHECK: [[CMP:%.*]] = fcmp ugt <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NLE_US);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_ord_q(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_ord_q
+ // CHECK: [[CMP:%.*]] = fcmp ord <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_ORD_Q);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_eq_uq(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_eq_uq
+ // CHECK: [[CMP:%.*]] = fcmp ueq <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_EQ_UQ);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_nge_us(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_nge_us
+ // CHECK: [[CMP:%.*]] = fcmp ult <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NGE_US);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_ngt_us(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_ngt_us
+ // CHECK: [[CMP:%.*]] = fcmp ule <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NGT_US);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_false_oq(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_false_oq
+ // CHECK: [[CMP:%.*]] = fcmp false <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_FALSE_OQ);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_neq_oq(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_neq_oq
+ // CHECK: [[CMP:%.*]] = fcmp one <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_OQ);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_ge_os(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_ge_os
+ // CHECK: [[CMP:%.*]] = fcmp oge <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_GE_OS);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_gt_os(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_gt_os
+ // CHECK: [[CMP:%.*]] = fcmp ogt <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_GT_OS);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_true_uq(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_true_uq
+ // CHECK: [[CMP:%.*]] = fcmp true <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_TRUE_UQ);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_eq_os(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_eq_os
+ // CHECK: [[CMP:%.*]] = fcmp oeq <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_EQ_OS);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_lt_oq(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_lt_oq
+ // CHECK: [[CMP:%.*]] = fcmp olt <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LT_OQ);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_le_oq(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_le_oq
+ // CHECK: [[CMP:%.*]] = fcmp ole <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LE_OQ);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_unord_s(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_unord_s
+ // CHECK: [[CMP:%.*]] = fcmp uno <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_UNORD_S);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_neq_us(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_neq_us
+ // CHECK: [[CMP:%.*]] = fcmp une <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_US);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_nlt_uq(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_nlt_uq
+ // CHECK: [[CMP:%.*]] = fcmp uge <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NLT_UQ);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_nle_uq(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_nle_uq
+ // CHECK: [[CMP:%.*]] = fcmp ugt <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NLE_UQ);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_ord_s(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_ord_s
+ // CHECK: [[CMP:%.*]] = fcmp ord <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_ORD_S);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_eq_us(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_eq_us
+ // CHECK: [[CMP:%.*]] = fcmp ueq <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_EQ_US);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_nge_uq(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_nge_uq
+ // CHECK: [[CMP:%.*]] = fcmp ult <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NGE_UQ);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_ngt_uq(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_ngt_uq
+ // CHECK: [[CMP:%.*]] = fcmp ule <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NGT_UQ);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_false_os(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_false_os
+ // CHECK: [[CMP:%.*]] = fcmp false <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_FALSE_OS);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_neq_os(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_neq_os
+ // CHECK: [[CMP:%.*]] = fcmp one <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_OS);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_ge_oq(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_ge_oq
+ // CHECK: [[CMP:%.*]] = fcmp oge <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_GE_OQ);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_gt_oq(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_gt_oq
+ // CHECK: [[CMP:%.*]] = fcmp ogt <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_GT_OQ);
+}
+
+__mmask8 test_mm512_mask_cmp_pd_mask_true_us(__mmask8 m, __m512d a, __m512d b) {
+ // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_true_us
+ // CHECK: [[CMP:%.*]] = fcmp true <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_TRUE_US);
}
__mmask8 test_mm512_mask_cmp_pd_mask(__mmask8 m, __m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_mask_cmp_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: [[CMP:%.*]] = fcmp oeq <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmp_pd_mask(m, a, b, 0);
}
__mmask8 test_mm512_cmpeq_pd_mask(__m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_cmpeq_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: fcmp oeq <8 x double> %{{.*}}, %{{.*}}
return _mm512_cmpeq_pd_mask(a, b);
}
-__mmask8 test_mm512_cmpeq_ps_mask(__m512 a, __m512 b) {
+__mmask16 test_mm512_cmpeq_ps_mask(__m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_cmpeq_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: fcmp oeq <16 x float> %{{.*}}, %{{.*}}
return _mm512_cmpeq_ps_mask(a, b);
}
__mmask8 test_mm512_mask_cmpeq_pd_mask(__mmask8 k, __m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_mask_cmpeq_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: [[CMP:%.*]] = fcmp oeq <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmpeq_pd_mask(k, a, b);
}
-__mmask8 test_mm512_mask_cmpeq_ps_mask(__mmask8 k, __m512 a, __m512 b) {
+__mmask16 test_mm512_mask_cmpeq_ps_mask(__mmask16 k, __m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_mask_cmpeq_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: [[CMP:%.*]] = fcmp oeq <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmpeq_ps_mask(k, a, b);
}
__mmask8 test_mm512_cmple_pd_mask(__m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_cmple_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
- return _mm512_cmpeq_pd_mask(a, b);
+ // CHECK: fcmp ole <8 x double> %{{.*}}, %{{.*}}
+ return _mm512_cmple_pd_mask(a, b);
}
-__mmask8 test_mm512_cmple_ps_mask(__m512 a, __m512 b) {
+__mmask16 test_mm512_cmple_ps_mask(__m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_cmple_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
- return _mm512_cmpeq_ps_mask(a, b);
+ // CHECK: fcmp ole <16 x float> %{{.*}}, %{{.*}}
+ return _mm512_cmple_ps_mask(a, b);
}
__mmask8 test_mm512_mask_cmple_pd_mask(__mmask8 k, __m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_mask_cmple_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: [[CMP:%.*]] = fcmp ole <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmple_pd_mask(k, a, b);
}
-__mmask8 test_mm512_mask_cmple_ps_mask(__mmask8 k, __m512 a, __m512 b) {
+__mmask16 test_mm512_mask_cmple_ps_mask(__mmask16 k, __m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_mask_cmple_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: [[CMP:%.*]] = fcmp ole <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmple_ps_mask(k, a, b);
}
__mmask8 test_mm512_cmplt_pd_mask(__m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_cmplt_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: fcmp olt <8 x double> %{{.*}}, %{{.*}}
return _mm512_cmplt_pd_mask(a, b);
}
-__mmask8 test_mm512_cmplt_ps_mask(__m512 a, __m512 b) {
+__mmask16 test_mm512_cmplt_ps_mask(__m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_cmplt_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: fcmp olt <16 x float> %{{.*}}, %{{.*}}
return _mm512_cmplt_ps_mask(a, b);
}
__mmask8 test_mm512_mask_cmplt_pd_mask(__mmask8 k, __m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_mask_cmplt_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: [[CMP:%.*]] = fcmp olt <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmplt_pd_mask(k, a, b);
}
-__mmask8 test_mm512_mask_cmplt_ps_mask(__mmask8 k, __m512 a, __m512 b) {
+__mmask16 test_mm512_mask_cmplt_ps_mask(__mmask16 k, __m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_mask_cmplt_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: [[CMP:%.*]] = fcmp olt <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmplt_ps_mask(k, a, b);
}
__mmask8 test_mm512_cmpneq_pd_mask(__m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_cmpneq_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: fcmp une <8 x double> %{{.*}}, %{{.*}}
return _mm512_cmpneq_pd_mask(a, b);
}
-__mmask8 test_mm512_cmpneq_ps_mask(__m512 a, __m512 b) {
+__mmask16 test_mm512_cmpneq_ps_mask(__m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_cmpneq_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: fcmp une <16 x float> %{{.*}}, %{{.*}}
return _mm512_cmpneq_ps_mask(a, b);
}
__mmask8 test_mm512_mask_cmpneq_pd_mask(__mmask8 k, __m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_mask_cmpneq_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: [[CMP:%.*]] = fcmp une <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmpneq_pd_mask(k, a, b);
}
-__mmask8 test_mm512_mask_cmpneq_ps_mask(__mmask8 k, __m512 a, __m512 b) {
+__mmask16 test_mm512_mask_cmpneq_ps_mask(__mmask16 k, __m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_mask_cmpneq_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: [[CMP:%.*]] = fcmp une <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmpneq_ps_mask(k, a, b);
}
__mmask8 test_mm512_cmpnle_pd_mask(__m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_cmpnle_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: fcmp ugt <8 x double> %{{.*}}, %{{.*}}
return _mm512_cmpnle_pd_mask(a, b);
}
-__mmask8 test_mm512_cmpnle_ps_mask(__m512 a, __m512 b) {
+__mmask16 test_mm512_cmpnle_ps_mask(__m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_cmpnle_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: fcmp ugt <16 x float> %{{.*}}, %{{.*}}
return _mm512_cmpnle_ps_mask(a, b);
}
__mmask8 test_mm512_mask_cmpnle_pd_mask(__mmask8 k, __m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_mask_cmpnle_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: [[CMP:%.*]] = fcmp ugt <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmpnle_pd_mask(k, a, b);
}
-__mmask8 test_mm512_mask_cmpnle_ps_mask(__mmask8 k, __m512 a, __m512 b) {
+__mmask16 test_mm512_mask_cmpnle_ps_mask(__mmask16 k, __m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_mask_cmpnle_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: [[CMP:%.*]] = fcmp ugt <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmpnle_ps_mask(k, a, b);
}
__mmask8 test_mm512_cmpnlt_pd_mask(__m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_cmpnlt_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: fcmp uge <8 x double> %{{.*}}, %{{.*}}
return _mm512_cmpnlt_pd_mask(a, b);
}
-__mmask8 test_mm512_cmpnlt_ps_mask(__m512 a, __m512 b) {
+__mmask16 test_mm512_cmpnlt_ps_mask(__m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_cmpnlt_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: fcmp uge <16 x float> %{{.*}}, %{{.*}}
return _mm512_cmpnlt_ps_mask(a, b);
}
__mmask8 test_mm512_mask_cmpnlt_pd_mask(__mmask8 k, __m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_mask_cmpnlt_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: [[CMP:%.*]] = fcmp uge <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmpnlt_pd_mask(k, a, b);
}
-__mmask8 test_mm512_mask_cmpnlt_ps_mask(__mmask8 k, __m512 a, __m512 b) {
+__mmask16 test_mm512_mask_cmpnlt_ps_mask(__mmask16 k, __m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_mask_cmpnlt_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: [[CMP:%.*]] = fcmp uge <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmpnlt_ps_mask(k, a, b);
}
__mmask8 test_mm512_cmpord_pd_mask(__m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_cmpord_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: fcmp ord <8 x double> %{{.*}}, %{{.*}}
return _mm512_cmpord_pd_mask(a, b);
}
-__mmask8 test_mm512_cmpord_ps_mask(__m512 a, __m512 b) {
+__mmask16 test_mm512_cmpord_ps_mask(__m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_cmpord_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: fcmp ord <16 x float> %{{.*}}, %{{.*}}
return _mm512_cmpord_ps_mask(a, b);
}
__mmask8 test_mm512_mask_cmpord_pd_mask(__mmask8 k, __m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_mask_cmpord_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: [[CMP:%.*]] = fcmp ord <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmpord_pd_mask(k, a, b);
}
-__mmask8 test_mm512_mask_cmpord_ps_mask(__mmask8 k, __m512 a, __m512 b) {
+__mmask16 test_mm512_mask_cmpord_ps_mask(__mmask16 k, __m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_mask_cmpord_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: [[CMP:%.*]] = fcmp ord <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmpord_ps_mask(k, a, b);
}
__mmask8 test_mm512_cmpunord_pd_mask(__m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_cmpunord_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: fcmp uno <8 x double> %{{.*}}, %{{.*}}
return _mm512_cmpunord_pd_mask(a, b);
}
-__mmask8 test_mm512_cmpunord_ps_mask(__m512 a, __m512 b) {
+__mmask16 test_mm512_cmpunord_ps_mask(__m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_cmpunord_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: fcmp uno <16 x float> %{{.*}}, %{{.*}}
return _mm512_cmpunord_ps_mask(a, b);
}
__mmask8 test_mm512_mask_cmpunord_pd_mask(__mmask8 k, __m512d a, __m512d b) {
// CHECK-LABEL: @test_mm512_mask_cmpunord_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
+ // CHECK: [[CMP:%.*]] = fcmp uno <8 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmpunord_pd_mask(k, a, b);
}
-__mmask8 test_mm512_mask_cmpunord_ps_mask(__mmask8 k, __m512 a, __m512 b) {
+__mmask16 test_mm512_mask_cmpunord_ps_mask(__mmask16 k, __m512 a, __m512 b) {
// CHECK-LABEL: @test_mm512_mask_cmpunord_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
+ // CHECK: [[CMP:%.*]] = fcmp uno <16 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <16 x i1> [[CMP]], {{.*}}
return _mm512_mask_cmpunord_ps_mask(k, a, b);
}
__m256d test_mm512_extractf64x4_pd(__m512d a)
{
// CHECK-LABEL: @test_mm512_extractf64x4_pd
- // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm512_extractf64x4_pd(a, 1);
}
__m256d test_mm512_mask_extractf64x4_pd(__m256d __W,__mmask8 __U,__m512d __A){
// CHECK-LABEL:@test_mm512_mask_extractf64x4_pd
- // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm512_mask_extractf64x4_pd( __W, __U, __A, 1);
}
__m256d test_mm512_maskz_extractf64x4_pd(__mmask8 __U,__m512d __A){
// CHECK-LABEL:@test_mm512_maskz_extractf64x4_pd
- // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm512_maskz_extractf64x4_pd( __U, __A, 1);
}
@@ -1263,20 +2374,20 @@ __m256d test_mm512_maskz_extractf64x4_pd(__mmask8 __U,__m512d __A){
__m128 test_mm512_extractf32x4_ps(__m512 a)
{
// CHECK-LABEL: @test_mm512_extractf32x4_ps
- // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm512_extractf32x4_ps(a, 1);
}
-__m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U,__m512d __A){
+__m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U,__m512 __A){
// CHECK-LABEL:@test_mm512_mask_extractf32x4_ps
- // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm512_mask_extractf32x4_ps( __W, __U, __A, 1);
}
-__m128 test_mm512_maskz_extractf32x4_ps( __mmask8 __U,__m512d __A){
+__m128 test_mm512_maskz_extractf32x4_ps( __mmask8 __U,__m512 __A){
// CHECK-LABEL:@test_mm512_maskz_extractf32x4_ps
- // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm512_maskz_extractf32x4_ps( __U, __A, 1);
}
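// NOTE (an assumption about the change; not part of the original test): extracting a
// 256- or 128-bit subvector only reads lanes of the first shufflevector source, so the
// second operand is a don't-care -- hence the CHECK lines above now expect "undef"
// instead of "zeroinitializer", while the index mask <4, 5, 6, 7> still selects the
// element group requested by the immediate 1.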
@@ -1870,42 +2981,58 @@ __m512i test_mm512_add_epi64(__m512i __A, __m512i __B) {
__m512i test_mm512_mul_epi32(__m512i __A, __m512i __B) {
//CHECK-LABEL: @test_mm512_mul_epi32
- //CHECK: @llvm.x86.avx512.pmul.dq.512
+ //CHECK: shl <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ //CHECK: ashr <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ //CHECK: shl <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ //CHECK: ashr <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
return _mm512_mul_epi32(__A,__B);
}
-__m512i test_mm512_maskz_mul_epi32 (__mmask16 __k,__m512i __A, __m512i __B) {
+__m512i test_mm512_maskz_mul_epi32 (__mmask8 __k,__m512i __A, __m512i __B) {
//CHECK-LABEL: @test_mm512_maskz_mul_epi32
- //CHECK: @llvm.x86.avx512.pmul.dq.512
+ //CHECK: shl <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ //CHECK: ashr <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ //CHECK: shl <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ //CHECK: ashr <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
//CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_maskz_mul_epi32(__k,__A,__B);
}
-__m512i test_mm512_mask_mul_epi32 (__mmask16 __k,__m512i __A, __m512i __B,
- __m512i __src) {
+__m512i test_mm512_mask_mul_epi32 (__mmask8 __k,__m512i __A, __m512i __B, __m512i __src) {
//CHECK-LABEL: @test_mm512_mask_mul_epi32
- //CHECK: @llvm.x86.avx512.pmul.dq.512
+ //CHECK: shl <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ //CHECK: ashr <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ //CHECK: shl <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ //CHECK: ashr <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
//CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask_mul_epi32(__src,__k,__A,__B);
}
__m512i test_mm512_mul_epu32 (__m512i __A, __m512i __B) {
//CHECK-LABEL: @test_mm512_mul_epu32
- //CHECK: @llvm.x86.avx512.pmulu.dq.512
+ //CHECK: and <8 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ //CHECK: and <8 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
return _mm512_mul_epu32(__A,__B);
}
-__m512i test_mm512_maskz_mul_epu32 (__mmask16 __k,__m512i __A, __m512i __B) {
+__m512i test_mm512_maskz_mul_epu32 (__mmask8 __k,__m512i __A, __m512i __B) {
//CHECK-LABEL: @test_mm512_maskz_mul_epu32
- //CHECK: @llvm.x86.avx512.pmulu.dq.512
+ //CHECK: and <8 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ //CHECK: and <8 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
//CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_maskz_mul_epu32(__k,__A,__B);
}
-__m512i test_mm512_mask_mul_epu32 (__mmask16 __k,__m512i __A, __m512i __B,
- __m512i __src) {
+__m512i test_mm512_mask_mul_epu32 (__mmask8 __k,__m512i __A, __m512i __B, __m512i __src) {
//CHECK-LABEL: @test_mm512_mask_mul_epu32
- //CHECK: @llvm.x86.avx512.pmulu.dq.512
+ //CHECK: and <8 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ //CHECK: and <8 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
//CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask_mul_epu32(__src,__k,__A,__B);
}
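// NOTE (illustrative sketch, an assumption about the lowering; not part of the original
// test): with the pmul.dq/pmulu.dq intrinsics gone, _mm512_mul_epi32/_mm512_mul_epu32
// are checked as generic IR per 64-bit lane -- sign-extend the low 32 bits (shl 32 then
// ashr 32) or zero-extend them (and with 0xFFFFFFFF), then do a full 64-bit multiply.
// Scalar analogue of one lane:
//   signed:   (long long)(int)a * (long long)(int)b
//   unsigned: (a & 0xFFFFFFFFULL) * (b & 0xFFFFFFFFULL)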
@@ -1930,19 +3057,34 @@ __m512i test_mm512_mullo_epi32(__m512i __A, __m512i __B) {
return _mm512_mullo_epi32(__A,__B);
}
+__m512i test_mm512_mullox_epi64 (__m512i __A, __m512i __B) {
+ // CHECK-LABEL: @test_mm512_mullox_epi64
+ // CHECK: mul <8 x i64>
+ return (__m512i) _mm512_mullox_epi64(__A, __B);
+}
+
+__m512i test_mm512_mask_mullox_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+ // CHECK-LABEL: @test_mm512_mask_mullox_epi64
+ // CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return (__m512i) _mm512_mask_mullox_epi64(__W, __U, __A, __B);
+}
+
__m512d test_mm512_add_round_pd(__m512d __A, __m512d __B) {
// CHECK-LABEL: @test_mm512_add_round_pd
- // CHECK: @llvm.x86.avx512.mask.add.pd.512
+ // CHECK: @llvm.x86.avx512.add.pd.512
return _mm512_add_round_pd(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask_add_round_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
// CHECK-LABEL: @test_mm512_mask_add_round_pd
- // CHECK: @llvm.x86.avx512.mask.add.pd.512
+ // CHECK: @llvm.x86.avx512.add.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_add_round_pd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_maskz_add_round_pd(__mmask8 __U, __m512d __A, __m512d __B) {
// CHECK-LABEL: @test_mm512_maskz_add_round_pd
- // CHECK: @llvm.x86.avx512.mask.add.pd.512
+ // CHECK: @llvm.x86.avx512.add.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_maskz_add_round_pd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
@@ -1959,17 +3101,19 @@ __m512d test_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
}
__m512 test_mm512_add_round_ps(__m512 __A, __m512 __B) {
// CHECK-LABEL: @test_mm512_add_round_ps
- // CHECK: @llvm.x86.avx512.mask.add.ps.512
+ // CHECK: @llvm.x86.avx512.add.ps.512
return _mm512_add_round_ps(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask_add_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
// CHECK-LABEL: @test_mm512_mask_add_round_ps
- // CHECK: @llvm.x86.avx512.mask.add.ps.512
+ // CHECK: @llvm.x86.avx512.add.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_add_round_ps(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_maskz_add_round_ps(__mmask16 __U, __m512 __A, __m512 __B) {
// CHECK-LABEL: @test_mm512_maskz_add_round_ps
- // CHECK: @llvm.x86.avx512.mask.add.ps.512
+ // CHECK: @llvm.x86.avx512.add.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_add_round_ps(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
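// NOTE (an assumption about the pattern; not part of the original test): the *_round_*
// arithmetic intrinsics still need a target intrinsic to carry the rounding mode, but it
// is now the unmasked @llvm.x86.avx512.add.{pd,ps}.512 form; the masking the old
// mask.add.*.512 intrinsics did internally is instead checked as a separate generic
// "select" on the result.  The sub_round tests further below follow the same pattern.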
__m512 test_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
@@ -2001,12 +3145,32 @@ __m128 test_mm_maskz_add_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
}
__m128 test_mm_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm_mask_add_ss
- // CHECK: @llvm.x86.avx512.mask.add.ss.round
+ // CHECK-NOT: @llvm.x86.avx512.mask.add.ss.round
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: fadd float %{{.*}}, %{{.*}}
+ // CHECK: insertelement <4 x float> %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
+ // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
return _mm_mask_add_ss(__W,__U,__A,__B);
}
__m128 test_mm_maskz_add_ss(__mmask8 __U, __m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm_maskz_add_ss
- // CHECK: @llvm.x86.avx512.mask.add.ss.round
+ // CHECK-NOT: @llvm.x86.avx512.mask.add.ss.round
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: fadd float %{{.*}}, %{{.*}}
+ // CHECK: insertelement <4 x float> %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
+ // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
return _mm_maskz_add_ss(__U,__A,__B);
}
__m128d test_mm_add_round_sd(__m128d __A, __m128d __B) {
@@ -2026,27 +3190,49 @@ __m128d test_mm_maskz_add_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
}
__m128d test_mm_mask_add_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
// CHECK-LABEL: @test_mm_mask_add_sd
- // CHECK: @llvm.x86.avx512.mask.add.sd.round
+ // CHECK-NOT: @llvm.x86.avx512.mask.add.sd.round
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: fadd double %{{.*}}, %{{.*}}
+ // CHECK: insertelement <2 x double> {{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
+ // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
return _mm_mask_add_sd(__W,__U,__A,__B);
}
__m128d test_mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) {
// CHECK-LABEL: @test_mm_maskz_add_sd
- // CHECK: @llvm.x86.avx512.mask.add.sd.round
+ // CHECK-NOT: @llvm.x86.avx512.mask.add.sd.round
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: fadd double %{{.*}}, %{{.*}}
+ // CHECK: insertelement <2 x double> {{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
+ // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
return _mm_maskz_add_sd(__U,__A,__B);
}
__m512d test_mm512_sub_round_pd(__m512d __A, __m512d __B) {
// CHECK-LABEL: @test_mm512_sub_round_pd
- // CHECK: @llvm.x86.avx512.mask.sub.pd.512
+ // CHECK: @llvm.x86.avx512.sub.pd.512
return _mm512_sub_round_pd(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask_sub_round_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
// CHECK-LABEL: @test_mm512_mask_sub_round_pd
- // CHECK: @llvm.x86.avx512.mask.sub.pd.512
+ // CHECK: @llvm.x86.avx512.sub.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_sub_round_pd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_maskz_sub_round_pd(__mmask8 __U, __m512d __A, __m512d __B) {
// CHECK-LABEL: @test_mm512_maskz_sub_round_pd
- // CHECK: @llvm.x86.avx512.mask.sub.pd.512
+ // CHECK: @llvm.x86.avx512.sub.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_maskz_sub_round_pd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
@@ -2063,17 +3249,19 @@ __m512d test_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
}
__m512 test_mm512_sub_round_ps(__m512 __A, __m512 __B) {
// CHECK-LABEL: @test_mm512_sub_round_ps
- // CHECK: @llvm.x86.avx512.mask.sub.ps.512
+ // CHECK: @llvm.x86.avx512.sub.ps.512
return _mm512_sub_round_ps(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask_sub_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
// CHECK-LABEL: @test_mm512_mask_sub_round_ps
- // CHECK: @llvm.x86.avx512.mask.sub.ps.512
+ // CHECK: @llvm.x86.avx512.sub.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_sub_round_ps(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_maskz_sub_round_ps(__mmask16 __U, __m512 __A, __m512 __B) {
// CHECK-LABEL: @test_mm512_maskz_sub_round_ps
- // CHECK: @llvm.x86.avx512.mask.sub.ps.512
+ // CHECK: @llvm.x86.avx512.sub.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_sub_round_ps(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
@@ -2105,12 +3293,32 @@ __m128 test_mm_maskz_sub_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
}
__m128 test_mm_mask_sub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm_mask_sub_ss
- // CHECK: @llvm.x86.avx512.mask.sub.ss.round
+ // CHECK-NOT: @llvm.x86.avx512.mask.sub.ss.round
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: fsub float %{{.*}}, %{{.*}}
+ // CHECK: insertelement <4 x float> {{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
+ // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
return _mm_mask_sub_ss(__W,__U,__A,__B);
}
__m128 test_mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm_maskz_sub_ss
- // CHECK: @llvm.x86.avx512.mask.sub.ss.round
+ // CHECK-NOT: @llvm.x86.avx512.mask.sub.ss.round
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: fsub float %{{.*}}, %{{.*}}
+ // CHECK: insertelement <4 x float> {{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
+ // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
return _mm_maskz_sub_ss(__U,__A,__B);
}
__m128d test_mm_sub_round_sd(__m128d __A, __m128d __B) {
@@ -2130,27 +3338,49 @@ __m128d test_mm_maskz_sub_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
}
__m128d test_mm_mask_sub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
// CHECK-LABEL: @test_mm_mask_sub_sd
- // CHECK: @llvm.x86.avx512.mask.sub.sd.round
+ // CHECK-NOT: @llvm.x86.avx512.mask.sub.sd.round
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: fsub double %{{.*}}, %{{.*}}
+ // CHECK: insertelement <2 x double> {{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
+ // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
return _mm_mask_sub_sd(__W,__U,__A,__B);
}
__m128d test_mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) {
// CHECK-LABEL: @test_mm_maskz_sub_sd
- // CHECK: @llvm.x86.avx512.mask.sub.sd.round
+ // CHECK-NOT: @llvm.x86.avx512.mask.sub.sd.round
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: fsub double %{{.*}}, %{{.*}}
+ // CHECK: insertelement <2 x double> {{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
+ // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
return _mm_maskz_sub_sd(__U,__A,__B);
}
__m512d test_mm512_mul_round_pd(__m512d __A, __m512d __B) {
// CHECK-LABEL: @test_mm512_mul_round_pd
- // CHECK: @llvm.x86.avx512.mask.mul.pd.512
+ // CHECK: @llvm.x86.avx512.mul.pd.512
return _mm512_mul_round_pd(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask_mul_round_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
// CHECK-LABEL: @test_mm512_mask_mul_round_pd
- // CHECK: @llvm.x86.avx512.mask.mul.pd.512
+ // CHECK: @llvm.x86.avx512.mul.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_mul_round_pd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_maskz_mul_round_pd(__mmask8 __U, __m512d __A, __m512d __B) {
// CHECK-LABEL: @test_mm512_maskz_mul_round_pd
- // CHECK: @llvm.x86.avx512.mask.mul.pd.512
+ // CHECK: @llvm.x86.avx512.mul.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_maskz_mul_round_pd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
@@ -2167,17 +3397,19 @@ __m512d test_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
}
__m512 test_mm512_mul_round_ps(__m512 __A, __m512 __B) {
// CHECK-LABEL: @test_mm512_mul_round_ps
- // CHECK: @llvm.x86.avx512.mask.mul.ps.512
+ // CHECK: @llvm.x86.avx512.mul.ps.512
return _mm512_mul_round_ps(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask_mul_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
// CHECK-LABEL: @test_mm512_mask_mul_round_ps
- // CHECK: @llvm.x86.avx512.mask.mul.ps.512
+ // CHECK: @llvm.x86.avx512.mul.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_mul_round_ps(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_maskz_mul_round_ps(__mmask16 __U, __m512 __A, __m512 __B) {
// CHECK-LABEL: @test_mm512_maskz_mul_round_ps
- // CHECK: @llvm.x86.avx512.mask.mul.ps.512
+ // CHECK: @llvm.x86.avx512.mul.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_mul_round_ps(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
@@ -2209,12 +3441,32 @@ __m128 test_mm_maskz_mul_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
}
__m128 test_mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm_mask_mul_ss
- // CHECK: @llvm.x86.avx512.mask.mul.ss.round
+ // CHECK-NOT: @llvm.x86.avx512.mask.mul.ss.round
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: fmul float %{{.*}}, %{{.*}}
+ // CHECK: insertelement <4 x float> {{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
+ // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
return _mm_mask_mul_ss(__W,__U,__A,__B);
}
__m128 test_mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm_maskz_mul_ss
- // CHECK: @llvm.x86.avx512.mask.mul.ss.round
+ // CHECK-NOT: @llvm.x86.avx512.mask.mul.ss.round
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: fmul float %{{.*}}, %{{.*}}
+ // CHECK: insertelement <4 x float> {{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
+ // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
return _mm_maskz_mul_ss(__U,__A,__B);
}
__m128d test_mm_mul_round_sd(__m128d __A, __m128d __B) {
@@ -2234,36 +3486,58 @@ __m128d test_mm_maskz_mul_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
}
__m128d test_mm_mask_mul_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
// CHECK-LABEL: @test_mm_mask_mul_sd
- // CHECK: @llvm.x86.avx512.mask.mul.sd.round
+ // CHECK-NOT: @llvm.x86.avx512.mask.mul.sd.round
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: fmul double %{{.*}}, %{{.*}}
+ // CHECK: insertelement <2 x double> {{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
+ // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
return _mm_mask_mul_sd(__W,__U,__A,__B);
}
__m128d test_mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) {
// CHECK-LABEL: @test_mm_maskz_mul_sd
- // CHECK: @llvm.x86.avx512.mask.mul.sd.round
+ // CHECK-NOT: @llvm.x86.avx512.mask.mul.sd.round
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: fmul double %{{.*}}, %{{.*}}
+ // CHECK: insertelement <2 x double> {{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
+ // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
return _mm_maskz_mul_sd(__U,__A,__B);
}
__m512d test_mm512_div_round_pd(__m512d __A, __m512d __B) {
// CHECK-LABEL: @test_mm512_div_round_pd
- // CHECK: @llvm.x86.avx512.mask.div.pd.512
+ // CHECK: @llvm.x86.avx512.div.pd.512
return _mm512_div_round_pd(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_mask_div_round_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
// CHECK-LABEL: @test_mm512_mask_div_round_pd
- // CHECK: @llvm.x86.avx512.mask.div.pd.512
+ // CHECK: @llvm.x86.avx512.div.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_div_round_pd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_maskz_div_round_pd(__mmask8 __U, __m512d __A, __m512d __B) {
// CHECK-LABEL: @test_mm512_maskz_div_round_pd
- // CHECK: @llvm.x86.avx512.mask.div.pd.512
+ // CHECK: @llvm.x86.avx512.div.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_maskz_div_round_pd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_div_pd(__m512d __a, __m512d __b) {
- // CHECK-LABLE: @test_mm512_div_pd
+ // CHECK-LABEL: @test_mm512_div_pd
// CHECK: fdiv <8 x double>
return _mm512_div_pd(__a,__b);
}
__m512d test_mm512_mask_div_pd(__m512d __w, __mmask8 __u, __m512d __a, __m512d __b) {
- // CHECK-LABLE: @test_mm512_mask_div_pd
+ // CHECK-LABEL: @test_mm512_mask_div_pd
// CHECK: fdiv <8 x double> %{{.*}}, %{{.*}}
// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_div_pd(__w,__u,__a,__b);
@@ -2276,17 +3550,19 @@ __m512d test_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
}
__m512 test_mm512_div_round_ps(__m512 __A, __m512 __B) {
// CHECK-LABEL: @test_mm512_div_round_ps
- // CHECK: @llvm.x86.avx512.mask.div.ps.512
+ // CHECK: @llvm.x86.avx512.div.ps.512
return _mm512_div_round_ps(__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_mask_div_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
// CHECK-LABEL: @test_mm512_mask_div_round_ps
- // CHECK: @llvm.x86.avx512.mask.div.ps.512
+ // CHECK: @llvm.x86.avx512.div.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_div_round_ps(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_maskz_div_round_ps(__mmask16 __U, __m512 __A, __m512 __B) {
// CHECK-LABEL: @test_mm512_maskz_div_round_ps
- // CHECK: @llvm.x86.avx512.mask.div.ps.512
+ // CHECK: @llvm.x86.avx512.div.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_div_round_ps(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_div_ps(__m512 __A, __m512 __B) {
@@ -2323,12 +3599,30 @@ __m128 test_mm_maskz_div_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
}
__m128 test_mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm_mask_div_ss
- // CHECK: @llvm.x86.avx512.mask.div.ss.round
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: fdiv float %{{.*}}, %{{.*}}
+ // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
+ // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
return _mm_mask_div_ss(__W,__U,__A,__B);
}
__m128 test_mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm_maskz_div_ss
- // CHECK: @llvm.x86.avx512.mask.div.ss.round
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: fdiv float %{{.*}}, %{{.*}}
+ // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
+ // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
return _mm_maskz_div_ss(__U,__A,__B);
}
__m128d test_mm_div_round_sd(__m128d __A, __m128d __B) {
@@ -2348,12 +3642,30 @@ __m128d test_mm_maskz_div_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
}
__m128d test_mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
// CHECK-LABEL: @test_mm_mask_div_sd
- // CHECK: @llvm.x86.avx512.mask.div.sd.round
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: fdiv double %{{.*}}, %{{.*}}
+ // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
+ // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
return _mm_mask_div_sd(__W,__U,__A,__B);
}
__m128d test_mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) {
// CHECK-LABEL: @test_mm_maskz_div_sd
- // CHECK: @llvm.x86.avx512.mask.div.sd.round
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: fdiv double %{{.*}}, %{{.*}}
+ // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
+ // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
return _mm_maskz_div_sd(__U,__A,__B);
}
__m128 test_mm_max_round_ss(__m128 __A, __m128 __B) {
@@ -2684,146 +3996,162 @@ __m512i test_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
__m512i test_mm512_rol_epi32(__m512i __A) {
// CHECK-LABEL: @test_mm512_rol_epi32
- // CHECK: @llvm.x86.avx512.mask.prol.d.512
+ // CHECK: @llvm.x86.avx512.prol.d.512
return _mm512_rol_epi32(__A, 5);
}
__m512i test_mm512_mask_rol_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_rol_epi32
- // CHECK: @llvm.x86.avx512.mask.prol.d.512
+ // CHECK: @llvm.x86.avx512.prol.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_rol_epi32(__W, __U, __A, 5);
}
__m512i test_mm512_maskz_rol_epi32(__mmask16 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_maskz_rol_epi32
- // CHECK: @llvm.x86.avx512.mask.prol.d.512
+ // CHECK: @llvm.x86.avx512.prol.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_rol_epi32(__U, __A, 5);
}
__m512i test_mm512_rol_epi64(__m512i __A) {
// CHECK-LABEL: @test_mm512_rol_epi64
- // CHECK: @llvm.x86.avx512.mask.prol.q.512
+ // CHECK: @llvm.x86.avx512.prol.q.512
return _mm512_rol_epi64(__A, 5);
}
__m512i test_mm512_mask_rol_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_rol_epi64
- // CHECK: @llvm.x86.avx512.mask.prol.q.512
+ // CHECK: @llvm.x86.avx512.prol.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask_rol_epi64(__W, __U, __A, 5);
}
__m512i test_mm512_maskz_rol_epi64(__mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_maskz_rol_epi64
- // CHECK: @llvm.x86.avx512.mask.prol.q.512
+ // CHECK: @llvm.x86.avx512.prol.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_maskz_rol_epi64(__U, __A, 5);
}
__m512i test_mm512_rolv_epi32(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_rolv_epi32
- // CHECK: @llvm.x86.avx512.mask.prolv.d.512
+ // CHECK: @llvm.x86.avx512.prolv.d.512
return _mm512_rolv_epi32(__A, __B);
}
__m512i test_mm512_mask_rolv_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_rolv_epi32
- // CHECK: @llvm.x86.avx512.mask.prolv.d.512
+ // CHECK: @llvm.x86.avx512.prolv.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_rolv_epi32(__W, __U, __A, __B);
}
__m512i test_mm512_maskz_rolv_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_rolv_epi32
- // CHECK: @llvm.x86.avx512.mask.prolv.d.512
+ // CHECK: @llvm.x86.avx512.prolv.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_rolv_epi32(__U, __A, __B);
}
__m512i test_mm512_rolv_epi64(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_rolv_epi64
- // CHECK: @llvm.x86.avx512.mask.prolv.q.512
+ // CHECK: @llvm.x86.avx512.prolv.q.512
return _mm512_rolv_epi64(__A, __B);
}
__m512i test_mm512_mask_rolv_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_rolv_epi64
- // CHECK: @llvm.x86.avx512.mask.prolv.q.512
+ // CHECK: @llvm.x86.avx512.prolv.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask_rolv_epi64(__W, __U, __A, __B);
}
__m512i test_mm512_maskz_rolv_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_rolv_epi64
- // CHECK: @llvm.x86.avx512.mask.prolv.q.512
+ // CHECK: @llvm.x86.avx512.prolv.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_maskz_rolv_epi64(__U, __A, __B);
}
__m512i test_mm512_ror_epi32(__m512i __A) {
// CHECK-LABEL: @test_mm512_ror_epi32
- // CHECK: @llvm.x86.avx512.mask.pror.d.512
+ // CHECK: @llvm.x86.avx512.pror.d.512
return _mm512_ror_epi32(__A, 5);
}
__m512i test_mm512_mask_ror_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_ror_epi32
- // CHECK: @llvm.x86.avx512.mask.pror.d.512
+ // CHECK: @llvm.x86.avx512.pror.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_ror_epi32(__W, __U, __A, 5);
}
__m512i test_mm512_maskz_ror_epi32(__mmask16 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_maskz_ror_epi32
- // CHECK: @llvm.x86.avx512.mask.pror.d.512
+ // CHECK: @llvm.x86.avx512.pror.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_ror_epi32(__U, __A, 5);
}
__m512i test_mm512_ror_epi64(__m512i __A) {
// CHECK-LABEL: @test_mm512_ror_epi64
- // CHECK: @llvm.x86.avx512.mask.pror.q.512
+ // CHECK: @llvm.x86.avx512.pror.q.512
return _mm512_ror_epi64(__A, 5);
}
__m512i test_mm512_mask_ror_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_ror_epi64
- // CHECK: @llvm.x86.avx512.mask.pror.q.512
+ // CHECK: @llvm.x86.avx512.pror.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask_ror_epi64(__W, __U, __A, 5);
}
__m512i test_mm512_maskz_ror_epi64(__mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_maskz_ror_epi64
- // CHECK: @llvm.x86.avx512.mask.pror.q.512
+ // CHECK: @llvm.x86.avx512.pror.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_maskz_ror_epi64(__U, __A, 5);
}
__m512i test_mm512_rorv_epi32(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_rorv_epi32
- // CHECK: @llvm.x86.avx512.mask.prorv.d.512
+ // CHECK: @llvm.x86.avx512.prorv.d.512
return _mm512_rorv_epi32(__A, __B);
}
__m512i test_mm512_mask_rorv_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_rorv_epi32
- // CHECK: @llvm.x86.avx512.mask.prorv.d.512
+ // CHECK: @llvm.x86.avx512.prorv.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_rorv_epi32(__W, __U, __A, __B);
}
__m512i test_mm512_maskz_rorv_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_rorv_epi32
- // CHECK: @llvm.x86.avx512.mask.prorv.d.512
+ // CHECK: @llvm.x86.avx512.prorv.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_rorv_epi32(__U, __A, __B);
}
__m512i test_mm512_rorv_epi64(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_rorv_epi64
- // CHECK: @llvm.x86.avx512.mask.prorv.q.512
+ // CHECK: @llvm.x86.avx512.prorv.q.512
return _mm512_rorv_epi64(__A, __B);
}
__m512i test_mm512_mask_rorv_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_rorv_epi64
- // CHECK: @llvm.x86.avx512.mask.prorv.q.512
+ // CHECK: @llvm.x86.avx512.prorv.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask_rorv_epi64(__W, __U, __A, __B);
}
__m512i test_mm512_maskz_rorv_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_rorv_epi64
- // CHECK: @llvm.x86.avx512.mask.prorv.q.512
+ // CHECK: @llvm.x86.avx512.prorv.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_maskz_rorv_epi64(__U, __A, __B);
}
@@ -3211,7 +4539,8 @@ unsigned long long test_mm_cvt_roundsd_si64(__m128d __A) {
#endif
__m512i test_mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask2_permutex2var_epi32
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.d.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask2_permutex2var_epi32(__A, __I, __U, __B);
}
__m512i test_mm512_unpackhi_epi32(__m512i __A, __m512i __B) {
@@ -3235,7 +4564,8 @@ long long test_mm_cvt_roundsd_i64(__m128d __A) {
#endif
__m512d test_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, __m512d __B) {
// CHECK-LABEL: @test_mm512_mask2_permutex2var_pd
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.pd.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask2_permutex2var_pd(__A, __I, __U, __B);
}
__m512i test_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
@@ -3337,7 +4667,7 @@ int test_mm_cvt_roundss_i32(__m128 __A) {
}
#ifdef __x86_64__
-int test_mm_cvt_roundss_si64(__m128 __A) {
+long long test_mm_cvt_roundss_si64(__m128 __A) {
// CHECK-LABEL: @test_mm_cvt_roundss_si64
// CHECK: @llvm.x86.avx512.vcvtss2si64
return _mm_cvt_roundss_si64(__A, _MM_FROUND_CUR_DIRECTION);
@@ -3703,52 +5033,54 @@ __m256i test_mm512_maskz_cvt_roundpd_epu32(__mmask8 U, __m512d A)
__m512 test_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B) {
// CHECK-LABEL: @test_mm512_mask2_permutex2var_ps
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.ps.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask2_permutex2var_ps(__A, __I, __U, __B);
}
__m512i test_mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask2_permutex2var_epi64
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.q.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask2_permutex2var_epi64(__A, __I, __U, __B);
}
__m512d test_mm512_permute_pd(__m512d __X) {
// CHECK-LABEL: @test_mm512_permute_pd
- // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
return _mm512_permute_pd(__X, 2);
}
__m512d test_mm512_mask_permute_pd(__m512d __W, __mmask8 __U, __m512d __X) {
// CHECK-LABEL: @test_mm512_mask_permute_pd
- // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_permute_pd(__W, __U, __X, 2);
}
__m512d test_mm512_maskz_permute_pd(__mmask8 __U, __m512d __X) {
// CHECK-LABEL: @test_mm512_maskz_permute_pd
- // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_maskz_permute_pd(__U, __X, 2);
}
__m512 test_mm512_permute_ps(__m512 __X) {
// CHECK-LABEL: @test_mm512_permute_ps
- // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+ // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
return _mm512_permute_ps(__X, 2);
}
__m512 test_mm512_mask_permute_ps(__m512 __W, __mmask16 __U, __m512 __X) {
// CHECK-LABEL: @test_mm512_mask_permute_ps
- // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+ // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_permute_ps(__W, __U, __X, 2);
}
__m512 test_mm512_maskz_permute_ps(__mmask16 __U, __m512 __X) {
// CHECK-LABEL: @test_mm512_maskz_permute_ps
- // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+ // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_permute_ps(__U, __X, 2);
}
@@ -3793,68 +5125,88 @@ __m512 test_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) {
return _mm512_maskz_permutevar_ps(__U, __A, __C);
}
+__m512i test_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) {
+ // CHECK-LABEL: @test_mm512_permutex2var_epi32
+ // CHECK: @llvm.x86.avx512.vpermi2var.d.512
+ return _mm512_permutex2var_epi32(__A, __I, __B);
+}
+
__m512i test_mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_permutex2var_epi32
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.d.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_permutex2var_epi32(__U, __A, __I, __B);
}
__m512i test_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U, __m512i __I, __m512i __B)
{
// CHECK-LABEL: @test_mm512_mask_permutex2var_epi32
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.d.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_permutex2var_epi32 (__A,__U,__I,__B);
}
__m512d test_mm512_permutex2var_pd (__m512d __A, __m512i __I, __m512d __B)
{
// CHECK-LABEL: @test_mm512_permutex2var_pd
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.pd.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.pd.512
return _mm512_permutex2var_pd (__A, __I,__B);
}
__m512d test_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
{
// CHECK-LABEL: @test_mm512_mask_permutex2var_pd
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.pd.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_permutex2var_pd (__A,__U,__I,__B);
}
__m512d test_mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I, __m512d __B) {
// CHECK-LABEL: @test_mm512_maskz_permutex2var_pd
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.pd.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_maskz_permutex2var_pd(__U, __A, __I, __B);
}
__m512 test_mm512_permutex2var_ps (__m512 __A, __m512i __I, __m512 __B)
{
// CHECK-LABEL: @test_mm512_permutex2var_ps
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.ps.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.ps.512
return _mm512_permutex2var_ps (__A, __I, __B);
}
__m512 test_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
{
// CHECK-LABEL: @test_mm512_mask_permutex2var_ps
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.ps.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_permutex2var_ps (__A,__U,__I,__B);
}
__m512 test_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B) {
// CHECK-LABEL: @test_mm512_maskz_permutex2var_ps
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.ps.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_permutex2var_ps(__U, __A, __I, __B);
}
+__m512i test_mm512_permutex2var_epi64 (__m512i __A, __m512i __I, __m512i __B){
+ // CHECK-LABEL: @test_mm512_permutex2var_epi64
+ // CHECK: @llvm.x86.avx512.vpermi2var.q.512
+ return _mm512_permutex2var_epi64(__A, __I, __B);
+}
+
__m512i test_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I, __m512i __B){
// CHECK-LABEL: @test_mm512_mask_permutex2var_epi64
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.q.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask_permutex2var_epi64(__A, __U, __I, __B);
}
__m512i test_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_permutex2var_epi64
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.q.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_maskz_permutex2var_epi64(__U, __A, __I, __B);
}
__mmask16 test_mm512_testn_epi32_mask(__m512i __A, __m512i __B) {
@@ -4015,19 +5367,19 @@ __m128 test_mm_roundscale_ss(__m128 __A, __m128 __B) {
return _mm_roundscale_ss(__A, __B, 3);
}
-__m128 test_mm_mask_roundscale_ss(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+__m128 test_mm_mask_roundscale_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
// CHECK-LABEL: @test_mm_mask_roundscale_ss
// CHECK: @llvm.x86.avx512.mask.rndscale.ss
return _mm_mask_roundscale_ss(__W,__U,__A,__B,3);
}
-__m128 test_mm_maskz_roundscale_round_ss( __mmask8 __U, __m128d __A, __m128d __B){
+__m128 test_mm_maskz_roundscale_round_ss( __mmask8 __U, __m128 __A, __m128 __B){
// CHECK-LABEL: @test_mm_maskz_roundscale_round_ss
// CHECK: @llvm.x86.avx512.mask.rndscale.ss
return _mm_maskz_roundscale_round_ss(__U,__A,__B,3,_MM_FROUND_CUR_DIRECTION);
}
-__m128 test_mm_maskz_roundscale_ss(__mmask8 __U, __m128d __A, __m128d __B){
+__m128 test_mm_maskz_roundscale_ss(__mmask8 __U, __m128 __A, __m128 __B){
// CHECK-LABEL: @test_mm_maskz_roundscale_ss
// CHECK: @llvm.x86.avx512.mask.rndscale.ss
return _mm_maskz_roundscale_ss(__U,__A,__B,3);
@@ -4107,8 +5459,8 @@ __m512 test_mm512_maskz_scalef_ps(__mmask16 __U, __m512 __A, __m512 __B) {
__m128d test_mm_scalef_round_sd(__m128d __A, __m128d __B) {
// CHECK-LABEL: @test_mm_scalef_round_sd
- // CHECK: @llvm.x86.avx512.mask.scalef
- return _mm_scalef_round_sd(__A, __B, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: @llvm.x86.avx512.mask.scalef.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 -1, i32 8)
+ return _mm_scalef_round_sd(__A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128d test_mm_scalef_sd(__m128d __A, __m128d __B) {
@@ -4125,8 +5477,8 @@ __m128d test_mm_mask_scalef_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d _
__m128d test_mm_mask_scalef_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
// CHECK-LABEL: @test_mm_mask_scalef_round_sd
- // CHECK: @llvm.x86.avx512.mask.scalef.sd
- return _mm_mask_scalef_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: @llvm.x86.avx512.mask.scalef.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 %{{.*}}, i32 8)
+ return _mm_mask_scalef_round_sd(__W, __U, __A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128d test_mm_maskz_scalef_sd(__mmask8 __U, __m128d __A, __m128d __B){
@@ -4137,14 +5489,14 @@ __m128d test_mm_maskz_scalef_sd(__mmask8 __U, __m128d __A, __m128d __B){
__m128d test_mm_maskz_scalef_round_sd(__mmask8 __U, __m128d __A, __m128d __B){
// CHECK-LABEL: @test_mm_maskz_scalef_round_sd
- // CHECK: @llvm.x86.avx512.mask.scalef.sd
- return _mm_maskz_scalef_round_sd(__U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: @llvm.x86.avx512.mask.scalef.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 %{{.*}}, i32 8)
+ return _mm_maskz_scalef_round_sd(__U, __A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_scalef_round_ss(__m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm_scalef_round_ss
- // CHECK: @llvm.x86.avx512.mask.scalef.ss
- return _mm_scalef_round_ss(__A, __B, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: @llvm.x86.avx512.mask.scalef.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 -1, i32 8)
+ return _mm_scalef_round_ss(__A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_scalef_ss(__m128 __A, __m128 __B) {
@@ -4161,8 +5513,8 @@ __m128 test_mm_mask_scalef_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
__m128 test_mm_mask_scalef_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
// CHECK-LABEL: @test_mm_mask_scalef_round_ss
- // CHECK: @llvm.x86.avx512.mask.scalef.ss
- return _mm_mask_scalef_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: @llvm.x86.avx512.mask.scalef.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 8)
+ return _mm_mask_scalef_round_ss(__W, __U, __A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_maskz_scalef_ss(__mmask8 __U, __m128 __A, __m128 __B){
@@ -4173,8 +5525,8 @@ __m128 test_mm_maskz_scalef_ss(__mmask8 __U, __m128 __A, __m128 __B){
__m128 test_mm_maskz_scalef_round_ss(__mmask8 __U, __m128 __A, __m128 __B){
// CHECK-LABEL: @test_mm_maskz_scalef_round_ss
- // CHECK: @llvm.x86.avx512.mask.scalef.ss
- return _mm_maskz_scalef_round_ss(__U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: @llvm.x86.avx512.mask.scalef.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 8)
+ return _mm_maskz_scalef_round_ss(__U, __A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512i test_mm512_srai_epi32(__m512i __A) {
@@ -4459,37 +5811,41 @@ __m512i test_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) {
__m512i test_mm512_ternarylogic_epi32(__m512i __A, __m512i __B, __m512i __C) {
// CHECK-LABEL: @test_mm512_ternarylogic_epi32
- // CHECK: @llvm.x86.avx512.mask.pternlog.d.512
+ // CHECK: @llvm.x86.avx512.pternlog.d.512
return _mm512_ternarylogic_epi32(__A, __B, __C, 4);
}
__m512i test_mm512_mask_ternarylogic_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
// CHECK-LABEL: @test_mm512_mask_ternarylogic_epi32
- // CHECK: @llvm.x86.avx512.mask.pternlog.d.512
+ // CHECK: @llvm.x86.avx512.pternlog.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_ternarylogic_epi32(__A, __U, __B, __C, 4);
}
__m512i test_mm512_maskz_ternarylogic_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
// CHECK-LABEL: @test_mm512_maskz_ternarylogic_epi32
- // CHECK: @llvm.x86.avx512.maskz.pternlog.d.512
+ // CHECK: @llvm.x86.avx512.pternlog.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> zeroinitializer
return _mm512_maskz_ternarylogic_epi32(__U, __A, __B, __C, 4);
}
__m512i test_mm512_ternarylogic_epi64(__m512i __A, __m512i __B, __m512i __C) {
// CHECK-LABEL: @test_mm512_ternarylogic_epi64
- // CHECK: @llvm.x86.avx512.mask.pternlog.q.512
+ // CHECK: @llvm.x86.avx512.pternlog.q.512
return _mm512_ternarylogic_epi64(__A, __B, __C, 4);
}
__m512i test_mm512_mask_ternarylogic_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C) {
// CHECK-LABEL: @test_mm512_mask_ternarylogic_epi64
- // CHECK: @llvm.x86.avx512.mask.pternlog.q.512
+ // CHECK: @llvm.x86.avx512.pternlog.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask_ternarylogic_epi64(__A, __U, __B, __C, 4);
}
__m512i test_mm512_maskz_ternarylogic_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C) {
// CHECK-LABEL: @test_mm512_maskz_ternarylogic_epi64
- // CHECK: @llvm.x86.avx512.maskz.pternlog.q.512
+ // CHECK: @llvm.x86.avx512.pternlog.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> zeroinitializer
return _mm512_maskz_ternarylogic_epi64(__U, __A, __B, __C, 4);
}
@@ -4535,20 +5891,20 @@ __m512d test_mm512_maskz_shuffle_f64x2(__mmask8 __U, __m512d __A, __m512d __B) {
__m512i test_mm512_shuffle_i32x4(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_shuffle_i32x4
- // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+ // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
return _mm512_shuffle_i32x4(__A, __B, 4);
}
__m512i test_mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_shuffle_i32x4
- // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+ // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_shuffle_i32x4(__W, __U, __A, __B, 4);
}
__m512i test_mm512_maskz_shuffle_i32x4(__mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_shuffle_i32x4
- // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+ // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_shuffle_i32x4(__U, __A, __B, 4);
}
@@ -4615,54 +5971,86 @@ __m512 test_mm512_maskz_shuffle_ps(__mmask16 __U, __m512 __M, __m512 __V) {
__m128d test_mm_sqrt_round_sd(__m128d __A, __m128d __B) {
// CHECK-LABEL: @test_mm_sqrt_round_sd
- // CHECK: @llvm.x86.avx512.mask.sqrt.sd
- return _mm_sqrt_round_sd(__A, __B, 4);
+ // CHECK: call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 -1, i32 8)
+ return _mm_sqrt_round_sd(__A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128d test_mm_mask_sqrt_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
- // CHECK: @llvm.x86.avx512.mask.sqrt.sd
- return _mm_mask_sqrt_sd(__W,__U,__A,__B);
+ // CHECK-LABEL: @test_mm_mask_sqrt_sd
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: call double @llvm.sqrt.f64(double %{{.*}})
+ // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 {{.*}}, double {{.*}}, double {{.*}}
+ // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double {{.*}}, i64 0
+ return _mm_mask_sqrt_sd(__W,__U,__A,__B);
}
__m128d test_mm_mask_sqrt_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
- // CHECK: @llvm.x86.avx512.mask.sqrt.sd
- return _mm_mask_sqrt_round_sd(__W,__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+ // CHECK-LABEL: @test_mm_mask_sqrt_round_sd
+ // CHECK: call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 %{{.*}}, i32 8)
+ return _mm_mask_sqrt_round_sd(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128d test_mm_maskz_sqrt_sd(__mmask8 __U, __m128d __A, __m128d __B){
- // CHECK: @llvm.x86.avx512.mask.sqrt.sd
- return _mm_maskz_sqrt_sd(__U,__A,__B);
+ // CHECK-LABEL: @test_mm_maskz_sqrt_sd
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: call double @llvm.sqrt.f64(double %{{.*}})
+ // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 {{.*}}, double {{.*}}, double {{.*}}
+ // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double {{.*}}, i64 0
+ return _mm_maskz_sqrt_sd(__U,__A,__B);
}
__m128d test_mm_maskz_sqrt_round_sd(__mmask8 __U, __m128d __A, __m128d __B){
- // CHECK: @llvm.x86.avx512.mask.sqrt.sd
- return _mm_maskz_sqrt_round_sd(__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+ // CHECK-LABEL: @test_mm_maskz_sqrt_round_sd
+ // CHECK: call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 %{{.*}}, i32 8)
+ return _mm_maskz_sqrt_round_sd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_sqrt_round_ss(__m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm_sqrt_round_ss
- // CHECK: @llvm.x86.avx512.mask.sqrt.ss
- return _mm_sqrt_round_ss(__A, __B, 4);
+ // CHECK: call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 -1, i32 8)
+ return _mm_sqrt_round_ss(__A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_mask_sqrt_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
- // CHECK: @llvm.x86.avx512.mask.sqrt.ss
- return _mm_mask_sqrt_ss(__W,__U,__A,__B);
+ // CHECK-LABEL: @test_mm_mask_sqrt_ss
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: call float @llvm.sqrt.f32(float %{{.*}})
+ // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 {{.*}}, float {{.*}}, float {{.*}}
+ // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float {{.*}}, i64 0
+ return _mm_mask_sqrt_ss(__W,__U,__A,__B);
}
__m128 test_mm_mask_sqrt_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
- // CHECK: @llvm.x86.avx512.mask.sqrt.ss
- return _mm_mask_sqrt_round_ss(__W,__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+ // CHECK-LABEL: @test_mm_mask_sqrt_round_ss
+ // CHECK: call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 {{.*}}, i32 8)
+ return _mm_mask_sqrt_round_ss(__W,__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_maskz_sqrt_ss(__mmask8 __U, __m128 __A, __m128 __B){
- // CHECK: @llvm.x86.avx512.mask.sqrt.ss
- return _mm_maskz_sqrt_ss(__U,__A,__B);
+ // CHECK-LABEL: @test_mm_maskz_sqrt_ss
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: call float @llvm.sqrt.f32(float %{{.*}})
+ // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: select i1 {{.*}}, float {{.*}}, float {{.*}}
+ // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float {{.*}}, i64 0
+ return _mm_maskz_sqrt_ss(__U,__A,__B);
}
__m128 test_mm_maskz_sqrt_round_ss(__mmask8 __U, __m128 __A, __m128 __B){
- // CHECK: @llvm.x86.avx512.mask.sqrt.ss
- return _mm_maskz_sqrt_round_ss(__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+ // CHECK-LABEL: @test_mm_maskz_sqrt_round_ss
+ // CHECK: call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 {{.*}}, i32 8)
+ return _mm_maskz_sqrt_round_ss(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512 test_mm512_broadcast_f32x4(float const* __A) {
@@ -4685,24 +6073,24 @@ __m512 test_mm512_maskz_broadcast_f32x4(__mmask16 __M, float const* __A) {
return _mm512_maskz_broadcast_f32x4(__M, _mm_loadu_ps(__A));
}
-__m512d test_mm512_broadcast_f64x4(float const* __A) {
+__m512d test_mm512_broadcast_f64x4(double const* __A) {
// CHECK-LABEL: @test_mm512_broadcast_f64x4
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- return _mm512_broadcast_f64x4(_mm256_loadu_ps(__A));
+ return _mm512_broadcast_f64x4(_mm256_loadu_pd(__A));
}
-__m512d test_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, float const* __A) {
+__m512d test_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, double const* __A) {
// CHECK-LABEL: @test_mm512_mask_broadcast_f64x4
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
- return _mm512_mask_broadcast_f64x4(__O, __M, _mm256_loadu_ps(__A));
+ return _mm512_mask_broadcast_f64x4(__O, __M, _mm256_loadu_pd(__A));
}
-__m512d test_mm512_maskz_broadcast_f64x4(__mmask8 __M, float const* __A) {
+__m512d test_mm512_maskz_broadcast_f64x4(__mmask8 __M, double const* __A) {
// CHECK-LABEL: @test_mm512_maskz_broadcast_f64x4
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
- return _mm512_maskz_broadcast_f64x4(__M, _mm256_loadu_ps(__A));
+ return _mm512_maskz_broadcast_f64x4(__M, _mm256_loadu_pd(__A));
}
__m512i test_mm512_broadcast_i32x4(__m128i const* __A) {
@@ -4747,80 +6135,80 @@ __m512i test_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i const* __A) {
__m512d test_mm512_broadcastsd_pd(__m128d __A) {
// CHECK-LABEL: @test_mm512_broadcastsd_pd
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> zeroinitializer, <8 x i32> zeroinitializer
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> zeroinitializer
return _mm512_broadcastsd_pd(__A);
}
__m512d test_mm512_mask_broadcastsd_pd(__m512d __O, __mmask8 __M, __m128d __A) {
// CHECK-LABEL: @test_mm512_mask_broadcastsd_pd
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> zeroinitializer, <8 x i32> zeroinitializer
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> zeroinitializer
// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_broadcastsd_pd(__O, __M, __A);
}
__m512d test_mm512_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) {
// CHECK-LABEL: @test_mm512_maskz_broadcastsd_pd
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> zeroinitializer, <8 x i32> zeroinitializer
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> zeroinitializer
// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_maskz_broadcastsd_pd(__M, __A);
}
__m512 test_mm512_broadcastss_ps(__m128 __A) {
// CHECK-LABEL: @test_mm512_broadcastss_ps
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <16 x i32> zeroinitializer
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> zeroinitializer
return _mm512_broadcastss_ps(__A);
}
__m512 test_mm512_mask_broadcastss_ps(__m512 __O, __mmask16 __M, __m128 __A) {
// CHECK-LABEL: @test_mm512_mask_broadcastss_ps
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <16 x i32> zeroinitializer
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> zeroinitializer
// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_broadcastss_ps(__O, __M, __A);
}
__m512 test_mm512_maskz_broadcastss_ps(__mmask16 __M, __m128 __A) {
// CHECK-LABEL: @test_mm512_maskz_broadcastss_ps
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <16 x i32> zeroinitializer
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> zeroinitializer
// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_broadcastss_ps(__M, __A);
}
__m512i test_mm512_broadcastd_epi32(__m128i __A) {
// CHECK-LABEL: @test_mm512_broadcastd_epi32
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> zeroinitializer, <16 x i32> zeroinitializer
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> zeroinitializer
return _mm512_broadcastd_epi32(__A);
}
__m512i test_mm512_mask_broadcastd_epi32(__m512i __O, __mmask16 __M, __m128i __A) {
// CHECK-LABEL: @test_mm512_mask_broadcastd_epi32
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> zeroinitializer, <16 x i32> zeroinitializer
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> zeroinitializer
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_broadcastd_epi32(__O, __M, __A);
}
__m512i test_mm512_maskz_broadcastd_epi32(__mmask16 __M, __m128i __A) {
// CHECK-LABEL: @test_mm512_maskz_broadcastd_epi32
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> zeroinitializer, <16 x i32> zeroinitializer
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> zeroinitializer
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_broadcastd_epi32(__M, __A);
}
__m512i test_mm512_broadcastq_epi64(__m128i __A) {
// CHECK-LABEL: @test_mm512_broadcastq_epi64
- // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> zeroinitializer, <8 x i32> zeroinitializer
+ // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> zeroinitializer
return _mm512_broadcastq_epi64(__A);
}
__m512i test_mm512_mask_broadcastq_epi64(__m512i __O, __mmask8 __M, __m128i __A) {
// CHECK-LABEL: @test_mm512_mask_broadcastq_epi64
- // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> zeroinitializer, <8 x i32> zeroinitializer
+ // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> zeroinitializer
// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask_broadcastq_epi64(__O, __M, __A);
}
__m512i test_mm512_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) {
// CHECK-LABEL: @test_mm512_maskz_broadcastq_epi64
- // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> zeroinitializer, <8 x i32> zeroinitializer
+ // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> zeroinitializer
// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_maskz_broadcastq_epi64(__M, __A);
}
@@ -5067,7 +6455,7 @@ void test_mm512_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __
__m128i test_mm512_cvtepi32_epi8(__m512i __A) {
// CHECK-LABEL: @test_mm512_cvtepi32_epi8
- // CHECK: @llvm.x86.avx512.mask.pmov.db.512
+ // CHECK: trunc <16 x i32> %{{.*}} to <16 x i8>
return _mm512_cvtepi32_epi8(__A);
}
@@ -5091,7 +6479,7 @@ void test_mm512_mask_cvtepi32_storeu_epi8(void * __P, __mmask16 __M, __m512i __A
__m256i test_mm512_cvtepi32_epi16(__m512i __A) {
// CHECK-LABEL: @test_mm512_cvtepi32_epi16
- // CHECK: @llvm.x86.avx512.mask.pmov.dw.512
+ // CHECK: trunc <16 x i32> %{{.*}} to <16 x i16>
return _mm512_cvtepi32_epi16(__A);
}
@@ -5139,19 +6527,21 @@ void test_mm512_mask_cvtepi64_storeu_epi8(void * __P, __mmask8 __M, __m512i __A)
__m256i test_mm512_cvtepi64_epi32(__m512i __A) {
// CHECK-LABEL: @test_mm512_cvtepi64_epi32
- // CHECK: @llvm.x86.avx512.mask.pmov.qd.512
+ // CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
return _mm512_cvtepi64_epi32(__A);
}
__m256i test_mm512_mask_cvtepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_cvtepi64_epi32
- // CHECK: @llvm.x86.avx512.mask.pmov.qd.512
+ // CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm512_mask_cvtepi64_epi32(__O, __M, __A);
}
__m256i test_mm512_maskz_cvtepi64_epi32(__mmask8 __M, __m512i __A) {
// CHECK-LABEL: @test_mm512_maskz_cvtepi64_epi32
- // CHECK: @llvm.x86.avx512.mask.pmov.qd.512
+ // CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm512_maskz_cvtepi64_epi32(__M, __A);
}
@@ -5163,7 +6553,7 @@ void test_mm512_mask_cvtepi64_storeu_epi32(void* __P, __mmask8 __M, __m512i __A)
__m128i test_mm512_cvtepi64_epi16(__m512i __A) {
// CHECK-LABEL: @test_mm512_cvtepi64_epi16
- // CHECK: @llvm.x86.avx512.mask.pmov.qw.512
+ // CHECK: trunc <8 x i64> %{{.*}} to <8 x i16>
return _mm512_cvtepi64_epi16(__A);
}
@@ -5187,40 +6577,40 @@ void test_mm512_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A)
__m128i test_mm512_extracti32x4_epi32(__m512i __A) {
// CHECK-LABEL: @test_mm512_extracti32x4_epi32
- // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+ // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
return _mm512_extracti32x4_epi32(__A, 3);
}
__m128i test_mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_extracti32x4_epi32
- // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+ // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm512_mask_extracti32x4_epi32(__W, __U, __A, 3);
}
__m128i test_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_maskz_extracti32x4_epi32
- // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+ // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm512_maskz_extracti32x4_epi32(__U, __A, 3);
}
__m256i test_mm512_extracti64x4_epi64(__m512i __A) {
// CHECK-LABEL: @test_mm512_extracti64x4_epi64
- // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm512_extracti64x4_epi64(__A, 1);
}
__m256i test_mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_extracti64x4_epi64
- // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm512_mask_extracti64x4_epi64(__W, __U, __A, 1);
}
__m256i test_mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_maskz_extracti64x4_epi64
- // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm512_maskz_extracti64x4_epi64(__U, __A, 1);
}
@@ -5815,401 +7205,889 @@ __m128 test_mm_maskz_getmant_round_ss(__mmask8 __U, __m128 __A, __m128 __B){
__m128 test_mm_mask_fmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
// CHECK-LABEL: @test_mm_mask_fmadd_ss
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[A]]
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
return _mm_mask_fmadd_ss(__W, __U, __A, __B);
}
+__m128 test_mm_fmadd_round_ss(__m128 __A, __m128 __B, __m128 __C){
+ // CHECK-LABEL: @test_mm_fmadd_round_ss
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 8)
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[FMA]], i64 0
+ return _mm_fmadd_round_ss(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+}
+
__m128 test_mm_mask_fmadd_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
// CHECK-LABEL: @test_mm_mask_fmadd_round_ss
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
- return _mm_mask_fmadd_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[A]]
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
+ return _mm_mask_fmadd_round_ss(__W, __U, __A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_maskz_fmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
// CHECK-LABEL: @test_mm_maskz_fmadd_ss
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float 0.000000e+00
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
return _mm_maskz_fmadd_ss(__U, __A, __B, __C);
}
__m128 test_mm_maskz_fmadd_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
// CHECK-LABEL: @test_mm_maskz_fmadd_round_ss
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
- return _mm_maskz_fmadd_round_ss(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float 0.000000e+00
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
+ return _mm_maskz_fmadd_round_ss(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_mask3_fmadd_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
// CHECK-LABEL: @test_mm_mask3_fmadd_ss
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+ // CHECK: [[A:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[ORIGC:%.+]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[C]]
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGC]], float [[SEL]], i64 0
return _mm_mask3_fmadd_ss(__W, __X, __Y, __U);
}
__m128 test_mm_mask3_fmadd_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
// CHECK-LABEL: @test_mm_mask3_fmadd_round_ss
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
- return _mm_mask3_fmadd_round_ss(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[A:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[ORIGC:%.+]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[C]]
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGC]], float [[SEL]], i64 0
+ return _mm_mask3_fmadd_round_ss(__W, __X, __Y, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_mask_fmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
// CHECK-LABEL: @test_mm_mask_fmsub_ss
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[A]]
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
return _mm_mask_fmsub_ss(__W, __U, __A, __B);
}
+__m128 test_mm_fmsub_round_ss(__m128 __A, __m128 __B, __m128 __C){
+ // CHECK-LABEL: @test_mm_fmsub_round_ss
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 8)
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[FMA]], i64 0
+ return _mm_fmsub_round_ss(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+}
+
__m128 test_mm_mask_fmsub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
// CHECK-LABEL: @test_mm_mask_fmsub_round_ss
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
- return _mm_mask_fmsub_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[A]]
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
+ return _mm_mask_fmsub_round_ss(__W, __U, __A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_maskz_fmsub_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
// CHECK-LABEL: @test_mm_maskz_fmsub_ss
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float 0.000000e+00
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
return _mm_maskz_fmsub_ss(__U, __A, __B, __C);
}
__m128 test_mm_maskz_fmsub_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
// CHECK-LABEL: @test_mm_maskz_fmsub_round_ss
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
- return _mm_maskz_fmsub_round_ss(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float 0.000000e+00
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
+ return _mm_maskz_fmsub_round_ss(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_mask3_fmsub_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
// CHECK-LABEL: @test_mm_mask3_fmsub_ss
- // CHECK: @llvm.x86.avx512.mask3.vfmsub.ss
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[ORIGC:%.+]]
+ // CHECK: [[A:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
+ // CHECK-NEXT: [[C2:%.+]] = extractelement <4 x float> [[ORIGC]], i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[C2]]
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGC]], float [[SEL]], i64 0
return _mm_mask3_fmsub_ss(__W, __X, __Y, __U);
}
__m128 test_mm_mask3_fmsub_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
// CHECK-LABEL: @test_mm_mask3_fmsub_round_ss
- // CHECK: @llvm.x86.avx512.mask3.vfmsub.ss
- return _mm_mask3_fmsub_round_ss(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[ORIGC:%.+]]
+ // CHECK: [[A:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 8)
+ // CHECK-NEXT: [[C2:%.+]] = extractelement <4 x float> [[ORIGC]], i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[C2]]
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGC]], float [[SEL]], i64 0
+ return _mm_mask3_fmsub_round_ss(__W, __X, __Y, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_mask_fnmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
// CHECK-LABEL: @test_mm_mask_fnmadd_ss
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[A]]
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
return _mm_mask_fnmadd_ss(__W, __U, __A, __B);
}
+__m128 test_mm_fnmadd_round_ss(__m128 __A, __m128 __B, __m128 __C){
+ // CHECK-LABEL: @test_mm_fnmadd_round_ss
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 8)
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[FMA]], i64 0
+ return _mm_fnmadd_round_ss(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+}
+
__m128 test_mm_mask_fnmadd_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
// CHECK-LABEL: @test_mm_mask_fnmadd_round_ss
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
- return _mm_mask_fnmadd_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[A]]
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
+ return _mm_mask_fnmadd_round_ss(__W, __U, __A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_maskz_fnmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
// CHECK-LABEL: @test_mm_maskz_fnmadd_ss
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float 0.000000e+00
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
return _mm_maskz_fnmadd_ss(__U, __A, __B, __C);
}
__m128 test_mm_maskz_fnmadd_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
// CHECK-LABEL: @test_mm_maskz_fnmadd_round_ss
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
- return _mm_maskz_fnmadd_round_ss(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float 0.000000e+00
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
+ return _mm_maskz_fnmadd_round_ss(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_mask3_fnmadd_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
// CHECK-LABEL: @test_mm_mask3_fnmadd_ss
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[ORIGC:%.+]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[C]]
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGC]], float [[SEL]], i64 0
return _mm_mask3_fnmadd_ss(__W, __X, __Y, __U);
}
__m128 test_mm_mask3_fnmadd_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
// CHECK-LABEL: @test_mm_mask3_fnmadd_round_ss
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
- return _mm_mask3_fnmadd_round_ss(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[ORIGC:%.+]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[C]]
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGC]], float [[SEL]], i64 0
+ return _mm_mask3_fnmadd_round_ss(__W, __X, __Y, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_mask_fnmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
// CHECK-LABEL: @test_mm_mask_fnmsub_ss
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[NEG2:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[A]]
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
return _mm_mask_fnmsub_ss(__W, __U, __A, __B);
}
+__m128 test_mm_fnmsub_round_ss(__m128 __A, __m128 __B, __m128 __C){
+ // CHECK-LABEL: @test_mm_fnmsub_round_ss
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[NEG2:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 8)
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[FMA]], i64 0
+ return _mm_fnmsub_round_ss(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+}
+
__m128 test_mm_mask_fnmsub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
// CHECK-LABEL: @test_mm_mask_fnmsub_round_ss
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
- return _mm_mask_fnmsub_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[NEG2:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[A]]
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
+ return _mm_mask_fnmsub_round_ss(__W, __U, __A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_maskz_fnmsub_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
// CHECK-LABEL: @test_mm_maskz_fnmsub_ss
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[NEG2:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float 0.000000e+00
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
return _mm_maskz_fnmsub_ss(__U, __A, __B, __C);
}
__m128 test_mm_maskz_fnmsub_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
// CHECK-LABEL: @test_mm_maskz_fnmsub_round_ss
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
- return _mm_maskz_fnmsub_round_ss(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[NEG2:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float 0.000000e+00
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
+ return _mm_maskz_fnmsub_round_ss(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128 test_mm_mask3_fnmsub_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
// CHECK-LABEL: @test_mm_mask3_fnmsub_ss
- // CHECK: @llvm.x86.avx512.mask3.vfnmsub.ss
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[NEG2:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[ORIGC:%.+]]
+ // CHECK: [[A:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
+ // CHECK-NEXT: [[C2:%.+]] = extractelement <4 x float> [[ORIGC]], i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[C2]]
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGC]], float [[SEL]], i64 0
return _mm_mask3_fnmsub_ss(__W, __X, __Y, __U);
}
__m128 test_mm_mask3_fnmsub_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
// CHECK-LABEL: @test_mm_mask3_fnmsub_round_ss
- // CHECK: @llvm.x86.avx512.mask3.vfnmsub.ss
- return _mm_mask3_fnmsub_round_ss(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[NEG2:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[ORIGC:%.+]]
+ // CHECK: [[A:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 8)
+ // CHECK-NEXT: [[C2:%.+]] = extractelement <4 x float> [[ORIGC]], i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[C2]]
+ // CHECK-NEXT: insertelement <4 x float> [[ORIGC]], float [[SEL]], i64 0
+ return _mm_mask3_fnmsub_round_ss(__W, __X, __Y, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128d test_mm_mask_fmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
// CHECK-LABEL: @test_mm_mask_fmadd_sd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[A]]
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
return _mm_mask_fmadd_sd(__W, __U, __A, __B);
}
+__m128d test_mm_fmadd_round_sd(__m128d __A, __m128d __B, __m128d __C){
+ // CHECK-LABEL: @test_mm_fmadd_round_sd
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 8)
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[FMA]], i64 0
+ return _mm_fmadd_round_sd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+}
+
__m128d test_mm_mask_fmadd_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
// CHECK-LABEL: @test_mm_mask_fmadd_round_sd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
- return _mm_mask_fmadd_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[A]]
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
+ return _mm_mask_fmadd_round_sd(__W, __U, __A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128d test_mm_maskz_fmadd_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
// CHECK-LABEL: @test_mm_maskz_fmadd_sd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double 0.000000e+00
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
return _mm_maskz_fmadd_sd(__U, __A, __B, __C);
}
__m128d test_mm_maskz_fmadd_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
// CHECK-LABEL: @test_mm_maskz_fmadd_round_sd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
- return _mm_maskz_fmadd_round_sd(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double 0.000000e+00
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
+ return _mm_maskz_fmadd_round_sd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128d test_mm_mask3_fmadd_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
// CHECK-LABEL: @test_mm_mask3_fmadd_sd
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+ // CHECK: [[A:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[ORIGC:%.+]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[C]]
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGC]], double [[SEL]], i64 0
return _mm_mask3_fmadd_sd(__W, __X, __Y, __U);
}
__m128d test_mm_mask3_fmadd_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
// CHECK-LABEL: @test_mm_mask3_fmadd_round_sd
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
- return _mm_mask3_fmadd_round_sd(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[A:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[ORIGC:%.+]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[C]]
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGC]], double [[SEL]], i64 0
+ return _mm_mask3_fmadd_round_sd(__W, __X, __Y, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128d test_mm_mask_fmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
// CHECK-LABEL: @test_mm_mask_fmsub_sd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[A]]
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
return _mm_mask_fmsub_sd(__W, __U, __A, __B);
}
+__m128d test_mm_fmsub_round_sd(__m128d __A, __m128d __B, __m128d __C){
+ // CHECK-LABEL: @test_mm_fmsub_round_sd
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 8)
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[FMA]], i64 0
+ return _mm_fmsub_round_sd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+}
+
__m128d test_mm_mask_fmsub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
// CHECK-LABEL: @test_mm_mask_fmsub_round_sd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
- return _mm_mask_fmsub_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[A]]
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
+ return _mm_mask_fmsub_round_sd(__W, __U, __A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128d test_mm_maskz_fmsub_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
// CHECK-LABEL: @test_mm_maskz_fmsub_sd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double 0.000000e+00
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
return _mm_maskz_fmsub_sd(__U, __A, __B, __C);
}
__m128d test_mm_maskz_fmsub_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
// CHECK-LABEL: @test_mm_maskz_fmsub_round_sd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
- return _mm_maskz_fmsub_round_sd(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double 0.000000e+00
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
+ return _mm_maskz_fmsub_round_sd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128d test_mm_mask3_fmsub_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
// CHECK-LABEL: @test_mm_mask3_fmsub_sd
- // CHECK: @llvm.x86.avx512.mask3.vfmsub.sd
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[ORIGC:%.+]]
+ // CHECK: [[A:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
+ // CHECK-NEXT: [[C2:%.+]] = extractelement <2 x double> [[ORIGC]], i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[C2]]
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGC]], double [[SEL]], i64 0
return _mm_mask3_fmsub_sd(__W, __X, __Y, __U);
}
__m128d test_mm_mask3_fmsub_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
// CHECK-LABEL: @test_mm_mask3_fmsub_round_sd
- // CHECK: @llvm.x86.avx512.mask3.vfmsub.sd
- return _mm_mask3_fmsub_round_sd(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[ORIGC:%.+]]
+ // CHECK: [[A:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 8)
+ // CHECK-NEXT: [[C2:%.+]] = extractelement <2 x double> [[ORIGC]], i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[C2]]
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGC]], double [[SEL]], i64 0
+ return _mm_mask3_fmsub_round_sd(__W, __X, __Y, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128d test_mm_mask_fnmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
// CHECK-LABEL: @test_mm_mask_fnmadd_sd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[A]]
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
return _mm_mask_fnmadd_sd(__W, __U, __A, __B);
}
+__m128d test_mm_fnmadd_round_sd(__m128d __A, __m128d __B, __m128d __C){
+ // CHECK-LABEL: @test_mm_fnmadd_round_sd
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 8)
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[FMA]], i64 0
+ return _mm_fnmadd_round_sd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+}
+
__m128d test_mm_mask_fnmadd_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
// CHECK-LABEL: @test_mm_mask_fnmadd_round_sd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
- return _mm_mask_fnmadd_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[A]]
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
+ return _mm_mask_fnmadd_round_sd(__W, __U, __A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128d test_mm_maskz_fnmadd_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
// CHECK-LABEL: @test_mm_maskz_fnmadd_sd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double 0.000000e+00
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
return _mm_maskz_fnmadd_sd(__U, __A, __B, __C);
}
__m128d test_mm_maskz_fnmadd_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
// CHECK-LABEL: @test_mm_maskz_fnmadd_round_sd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
- return _mm_maskz_fnmadd_round_sd(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double 0.000000e+00
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
+ return _mm_maskz_fnmadd_round_sd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128d test_mm_mask3_fnmadd_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
// CHECK-LABEL: @test_mm_mask3_fnmadd_sd
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[ORIGC:%.+]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[C]]
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGC]], double [[SEL]], i64 0
return _mm_mask3_fnmadd_sd(__W, __X, __Y, __U);
}
__m128d test_mm_mask3_fnmadd_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
// CHECK-LABEL: @test_mm_mask3_fnmadd_round_sd
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
- return _mm_mask3_fnmadd_round_sd(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[ORIGC:%.+]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[C]]
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGC]], double [[SEL]], i64 0
+ return _mm_mask3_fnmadd_round_sd(__W, __X, __Y, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128d test_mm_mask_fnmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
// CHECK-LABEL: @test_mm_mask_fnmsub_sd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[NEG2:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[A]]
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
return _mm_mask_fnmsub_sd(__W, __U, __A, __B);
}
+__m128d test_mm_fnmsub_round_sd(__m128d __A, __m128d __B, __m128d __C){
+ // CHECK-LABEL: @test_mm_fnmsub_round_sd
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[NEG2:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 8)
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[FMA]], i64 0
+ return _mm_fnmsub_round_sd(__A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+}
+
__m128d test_mm_mask_fnmsub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
// CHECK-LABEL: @test_mm_mask_fnmsub_round_sd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
- return _mm_mask_fnmsub_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[NEG2:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[A]]
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
+ return _mm_mask_fnmsub_round_sd(__W, __U, __A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128d test_mm_maskz_fnmsub_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
// CHECK-LABEL: @test_mm_maskz_fnmsub_sd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[NEG2:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double 0.000000e+00
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
return _mm_maskz_fnmsub_sd(__U, __A, __B, __C);
}
__m128d test_mm_maskz_fnmsub_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
// CHECK-LABEL: @test_mm_maskz_fnmsub_round_sd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
- return _mm_maskz_fnmsub_round_sd(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[NEG2:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.]], i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 8)
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double 0.000000e+00
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
+ return _mm_maskz_fnmsub_round_sd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m128d test_mm_mask3_fnmsub_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
// CHECK-LABEL: @test_mm_mask3_fnmsub_sd
- // CHECK: @llvm.x86.avx512.mask3.vfnmsub.sd
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[NEG2:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[ORIGC:%.+]]
+ // CHECK: [[A:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
+ // CHECK-NEXT: [[C2:%.+]] = extractelement <2 x double> [[ORIGC]], i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[C2]]
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGC]], double [[SEL]], i64 0
return _mm_mask3_fnmsub_sd(__W, __X, __Y, __U);
}
__m128d test_mm_mask3_fnmsub_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
// CHECK-LABEL: @test_mm_mask3_fnmsub_round_sd
- // CHECK: @llvm.x86.avx512.mask3.vfnmsub.sd
- return _mm_mask3_fnmsub_round_sd(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: [[NEG2:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[ORIGC:%.+]]
+ // CHECK: [[A:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
+ // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 8)
+ // CHECK-NEXT: [[C2:%.+]] = extractelement <2 x double> [[ORIGC]], i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[C2]]
+ // CHECK-NEXT: insertelement <2 x double> [[ORIGC]], double [[SEL]], i64 0
+ return _mm_mask3_fnmsub_round_sd(__W, __X, __Y, __U, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m512d test_mm512_permutex_pd(__m512d __X) {
// CHECK-LABEL: @test_mm512_permutex_pd
- // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
return _mm512_permutex_pd(__X, 0);
}
__m512d test_mm512_mask_permutex_pd(__m512d __W, __mmask8 __U, __m512d __X) {
// CHECK-LABEL: @test_mm512_mask_permutex_pd
- // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_permutex_pd(__W, __U, __X, 0);
}
__m512d test_mm512_maskz_permutex_pd(__mmask8 __U, __m512d __X) {
// CHECK-LABEL: @test_mm512_maskz_permutex_pd
- // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_maskz_permutex_pd(__U, __X, 0);
}
__m512i test_mm512_permutex_epi64(__m512i __X) {
// CHECK-LABEL: @test_mm512_permutex_epi64
- // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
return _mm512_permutex_epi64(__X, 0);
}
__m512i test_mm512_mask_permutex_epi64(__m512i __W, __mmask8 __M, __m512i __X) {
// CHECK-LABEL: @test_mm512_mask_permutex_epi64
- // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask_permutex_epi64(__W, __M, __X, 0);
}
__m512i test_mm512_maskz_permutex_epi64(__mmask8 __M, __m512i __X) {
// CHECK-LABEL: @test_mm512_maskz_permutex_epi64
- // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_maskz_permutex_epi64(__M, __X, 0);
}
__m512d test_mm512_permutexvar_pd(__m512i __X, __m512d __Y) {
// CHECK-LABEL: @test_mm512_permutexvar_pd
- // CHECK: @llvm.x86.avx512.mask.permvar.df.512
+ // CHECK: @llvm.x86.avx512.permvar.df.512
return _mm512_permutexvar_pd(__X, __Y);
}
__m512d test_mm512_mask_permutexvar_pd(__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) {
// CHECK-LABEL: @test_mm512_mask_permutexvar_pd
- // CHECK: @llvm.x86.avx512.mask.permvar.df.512
+ // CHECK: @llvm.x86.avx512.permvar.df.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_permutexvar_pd(__W, __U, __X, __Y);
}
__m512d test_mm512_maskz_permutexvar_pd(__mmask8 __U, __m512i __X, __m512d __Y) {
// CHECK-LABEL: @test_mm512_maskz_permutexvar_pd
- // CHECK: @llvm.x86.avx512.mask.permvar.df.512
+ // CHECK: @llvm.x86.avx512.permvar.df.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_maskz_permutexvar_pd(__U, __X, __Y);
}
__m512i test_mm512_maskz_permutexvar_epi64(__mmask8 __M, __m512i __X, __m512i __Y) {
// CHECK-LABEL: @test_mm512_maskz_permutexvar_epi64
- // CHECK: @llvm.x86.avx512.mask.permvar.di.512
+ // CHECK: @llvm.x86.avx512.permvar.di.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_maskz_permutexvar_epi64(__M, __X, __Y);
}
__m512i test_mm512_permutexvar_epi64(__m512i __X, __m512i __Y) {
// CHECK-LABEL: @test_mm512_permutexvar_epi64
- // CHECK: @llvm.x86.avx512.mask.permvar.di.512
+ // CHECK: @llvm.x86.avx512.permvar.di.512
return _mm512_permutexvar_epi64(__X, __Y);
}
__m512i test_mm512_mask_permutexvar_epi64(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) {
// CHECK-LABEL: @test_mm512_mask_permutexvar_epi64
- // CHECK: @llvm.x86.avx512.mask.permvar.di.512
+ // CHECK: @llvm.x86.avx512.permvar.di.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask_permutexvar_epi64(__W, __M, __X, __Y);
}
__m512 test_mm512_permutexvar_ps(__m512i __X, __m512 __Y) {
// CHECK-LABEL: @test_mm512_permutexvar_ps
- // CHECK: @llvm.x86.avx512.mask.permvar.sf.512
+ // CHECK: @llvm.x86.avx512.permvar.sf.512
return _mm512_permutexvar_ps(__X, __Y);
}
__m512 test_mm512_mask_permutexvar_ps(__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) {
// CHECK-LABEL: @test_mm512_mask_permutexvar_ps
- // CHECK: @llvm.x86.avx512.mask.permvar.sf.512
+ // CHECK: @llvm.x86.avx512.permvar.sf.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_permutexvar_ps(__W, __U, __X, __Y);
}
__m512 test_mm512_maskz_permutexvar_ps(__mmask16 __U, __m512i __X, __m512 __Y) {
// CHECK-LABEL: @test_mm512_maskz_permutexvar_ps
- // CHECK: @llvm.x86.avx512.mask.permvar.sf.512
+ // CHECK: @llvm.x86.avx512.permvar.sf.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_permutexvar_ps(__U, __X, __Y);
}
__m512i test_mm512_maskz_permutexvar_epi32(__mmask16 __M, __m512i __X, __m512i __Y) {
// CHECK-LABEL: @test_mm512_maskz_permutexvar_epi32
- // CHECK: @llvm.x86.avx512.mask.permvar.si.512
+ // CHECK: @llvm.x86.avx512.permvar.si.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_permutexvar_epi32(__M, __X, __Y);
}
__m512i test_mm512_permutexvar_epi32(__m512i __X, __m512i __Y) {
// CHECK-LABEL: @test_mm512_permutexvar_epi32
- // CHECK: @llvm.x86.avx512.mask.permvar.si.512
+ // CHECK: @llvm.x86.avx512.permvar.si.512
return _mm512_permutexvar_epi32(__X, __Y);
}
__m512i test_mm512_mask_permutexvar_epi32(__m512i __W, __mmask16 __M, __m512i __X, __m512i __Y) {
// CHECK-LABEL: @test_mm512_mask_permutexvar_epi32
- // CHECK: @llvm.x86.avx512.mask.permvar.si.512
+ // CHECK: @llvm.x86.avx512.permvar.si.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_permutexvar_epi32(__W, __M, __X, __Y);
}
@@ -6247,26 +8125,38 @@ __mmask16 test_mm512_kor(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m
__E, __F);
}
-int test_mm512_kortestc(__mmask16 __A, __mmask16 __B) {
+int test_mm512_kortestc(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
// CHECK-LABEL: @test_mm512_kortestc
- // CHECK: @llvm.x86.avx512.kortestc.w
- return _mm512_kortestc(__A, __B);
+ // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: [[OR:%.*]] = or <16 x i1> [[LHS]], [[RHS]]
+ // CHECK: [[CAST:%.*]] = bitcast <16 x i1> [[OR]] to i16
+ // CHECK: [[CMP:%.*]] = icmp eq i16 [[CAST]], -1
+ // CHECK: zext i1 [[CMP]] to i32
+ return _mm512_kortestc(_mm512_cmpneq_epu32_mask(__A, __B),
+ _mm512_cmpneq_epu32_mask(__C, __D));
}
-int test_mm512_kortestz(__mmask16 __A, __mmask16 __B) {
+int test_mm512_kortestz(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
// CHECK-LABEL: @test_mm512_kortestz
- // CHECK: @llvm.x86.avx512.kortestz.w
- return _mm512_kortestz(__A, __B);
+ // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: [[OR:%.*]] = or <16 x i1> [[LHS]], [[RHS]]
+ // CHECK: [[CAST:%.*]] = bitcast <16 x i1> [[OR]] to i16
+ // CHECK: [[CMP:%.*]] = icmp eq i16 [[CAST]], 0
+ // CHECK: zext i1 [[CMP]] to i32
+ return _mm512_kortestz(_mm512_cmpneq_epu32_mask(__A, __B),
+ _mm512_cmpneq_epu32_mask(__C, __D));
}
__mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
// CHECK-LABEL: @test_mm512_kunpackb
- // CHECK: bitcast <16 x i1> %{{.*}} to i16
- // CHECK: bitcast <16 x i1> %{{.*}} to i16
- // CHECK: and i32 %{{.*}}, 255
- // CHECK: shl i32 %{{.*}}, 8
- // CHECK: or i32 %{{.*}}, %{{.*}}
- // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+ // CHECK: [[LHS2:%.*]] = shufflevector <16 x i1> [[LHS]], <16 x i1> [[LHS]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: [[RHS2:%.*]] = shufflevector <16 x i1> [[RHS]], <16 x i1> [[RHS]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: [[CONCAT:%.*]] = shufflevector <8 x i1> [[RHS2]], <8 x i1> [[LHS2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ // CHECK: bitcast <16 x i1> [[CONCAT]] to i16
return _mm512_mask_cmpneq_epu32_mask(_mm512_kunpackb(_mm512_cmpneq_epu32_mask(__A, __B),
_mm512_cmpneq_epu32_mask(__C, __D)),
__E, __F);
@@ -6463,20 +8353,20 @@ __m512 test_mm512_maskz_moveldup_ps(__mmask16 __U, __m512 __A) {
__m512i test_mm512_shuffle_epi32(__m512i __A) {
// CHECK-LABEL: @test_mm512_shuffle_epi32
- // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+ // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
return _mm512_shuffle_epi32(__A, 1);
}
__m512i test_mm512_mask_shuffle_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_shuffle_epi32
- // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+ // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_shuffle_epi32(__W, __U, __A, 1);
}
__m512i test_mm512_maskz_shuffle_epi32(__mmask16 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_maskz_shuffle_epi32
- // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+ // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_shuffle_epi32(__U, __A, 1);
}
@@ -6506,40 +8396,52 @@ __m512i test_mm512_maskz_expand_epi64(__mmask8 __U, __m512i __A) {
}
__m512i test_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm512_mask_expandloadu_epi64
- // CHECK: @llvm.x86.avx512.mask.expand.load.q.512
+ // CHECK: @llvm.masked.expandload.v8i64(i64* %{{.*}}, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
return _mm512_mask_expandloadu_epi64(__W, __U, __P);
}
__m512i test_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm512_maskz_expandloadu_epi64
- // CHECK: @llvm.x86.avx512.mask.expand.load.q.512
+ // CHECK: @llvm.masked.expandload.v8i64(i64* %{{.*}}, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
return _mm512_maskz_expandloadu_epi64(__U, __P);
}
__m512d test_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm512_mask_expandloadu_pd
- // CHECK: @llvm.x86.avx512.mask.expand.load.pd.512
+ // CHECK: @llvm.masked.expandload.v8f64(double* %{{.*}}, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
return _mm512_mask_expandloadu_pd(__W, __U, __P);
}
__m512d test_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm512_maskz_expandloadu_pd
- // CHECK: @llvm.x86.avx512.mask.expand.load.pd.512
+ // CHECK: @llvm.masked.expandload.v8f64(double* %{{.*}}, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
return _mm512_maskz_expandloadu_pd(__U, __P);
}
__m512i test_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P) {
// CHECK-LABEL: @test_mm512_mask_expandloadu_epi32
- // CHECK: @llvm.x86.avx512.mask.expand.load.d.512
+ // CHECK: @llvm.masked.expandload.v16i32(i32* %{{.*}}, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
return _mm512_mask_expandloadu_epi32(__W, __U, __P);
}
__m512i test_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P) {
// CHECK-LABEL: @test_mm512_maskz_expandloadu_epi32
- // CHECK: @llvm.x86.avx512.mask.expand.load.d.512
+ // CHECK: @llvm.masked.expandload.v16i32(i32* %{{.*}}, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
return _mm512_maskz_expandloadu_epi32(__U, __P);
}
+__m512 test_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P) {
+ // CHECK-LABEL: @test_mm512_mask_expandloadu_ps
+ // CHECK: @llvm.masked.expandload.v16f32(float* %{{.*}}, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
+ return _mm512_mask_expandloadu_ps(__W, __U, __P);
+}
+
+__m512 test_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P) {
+ // CHECK-LABEL: @test_mm512_maskz_expandloadu_ps
+ // CHECK: @llvm.masked.expandload.v16f32(float* %{{.*}}, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
+ return _mm512_maskz_expandloadu_ps(__U, __P);
+}
+
__m512 test_mm512_mask_expand_ps(__m512 __W, __mmask16 __U, __m512 __A) {
// CHECK-LABEL: @test_mm512_mask_expand_ps
// CHECK: @llvm.x86.avx512.mask.expand.ps.512
@@ -6583,33 +8485,36 @@ __m512d test_mm512_maskz_cvt_roundps_pd(__mmask8 __U, __m256 __A) {
__m512d test_mm512_cvtps_pd(__m256 __A) {
// CHECK-LABEL: @test_mm512_cvtps_pd
- // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+ // CHECK: fpext <8 x float> %{{.*}} to <8 x double>
return _mm512_cvtps_pd(__A);
}
__m512d test_mm512_cvtpslo_pd(__m512 __A) {
// CHECK-LABEL: @test_mm512_cvtpslo_pd
// CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+ // CHECK: fpext <8 x float> %{{.*}} to <8 x double>
return _mm512_cvtpslo_pd(__A);
}
__m512d test_mm512_mask_cvtps_pd(__m512d __W, __mmask8 __U, __m256 __A) {
// CHECK-LABEL: @test_mm512_mask_cvtps_pd
- // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+ // CHECK: fpext <8 x float> %{{.*}} to <8 x double>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_cvtps_pd(__W, __U, __A);
}
__m512d test_mm512_mask_cvtpslo_pd(__m512d __W, __mmask8 __U, __m512 __A) {
// CHECK-LABEL: @test_mm512_mask_cvtpslo_pd
// CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+ // CHECK: fpext <8 x float> %{{.*}} to <8 x double>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_cvtpslo_pd(__W, __U, __A);
}
__m512d test_mm512_maskz_cvtps_pd(__mmask8 __U, __m256 __A) {
// CHECK-LABEL: @test_mm512_maskz_cvtps_pd
- // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+ // CHECK: fpext <8 x float> %{{.*}} to <8 x double>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_maskz_cvtps_pd(__U, __A);
}
__m512d test_mm512_mask_mov_pd(__m512d __W, __mmask8 __U, __m512d __A) {
@@ -6638,25 +8543,25 @@ __m512 test_mm512_maskz_mov_ps(__mmask16 __U, __m512 __A) {
void test_mm512_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m512d __A) {
// CHECK-LABEL: @test_mm512_mask_compressstoreu_pd
- // CHECK: @llvm.x86.avx512.mask.compress.store.pd.512
+ // CHECK: @llvm.masked.compressstore.v8f64(<8 x double> %{{.*}}, double* %{{.*}}, <8 x i1> %{{.*}})
return _mm512_mask_compressstoreu_pd(__P, __U, __A);
}
void test_mm512_mask_compressstoreu_epi64(void *__P, __mmask8 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_compressstoreu_epi64
- // CHECK: @llvm.x86.avx512.mask.compress.store.q.512
+ // CHECK: @llvm.masked.compressstore.v8i64(<8 x i64> %{{.*}}, i64* %{{.*}}, <8 x i1> %{{.*}})
return _mm512_mask_compressstoreu_epi64(__P, __U, __A);
}
void test_mm512_mask_compressstoreu_ps(void *__P, __mmask16 __U, __m512 __A) {
// CHECK-LABEL: @test_mm512_mask_compressstoreu_ps
- // CHECK: @llvm.x86.avx512.mask.compress.store.ps.512
+ // CHECK: @llvm.masked.compressstore.v16f32(<16 x float> %{{.*}}, float* %{{.*}}, <16 x i1> %{{.*}})
return _mm512_mask_compressstoreu_ps(__P, __U, __A);
}
void test_mm512_mask_compressstoreu_epi32(void *__P, __mmask16 __U, __m512i __A) {
// CHECK-LABEL: @test_mm512_mask_compressstoreu_epi32
- // CHECK: @llvm.x86.avx512.mask.compress.store.d.512
+ // CHECK: @llvm.masked.compressstore.v16i32(<16 x i32> %{{.*}}, i32* %{{.*}}, <16 x i1> %{{.*}})
return _mm512_mask_compressstoreu_epi32(__P, __U, __A);
}
@@ -6729,7 +8634,7 @@ __m512d test_mm512_castpd128_pd512(__m128d __A) {
return _mm512_castpd128_pd512(__A);
}
-__m512d test_mm512_set1_epi8(char d)
+__m512i test_mm512_set1_epi8(char d)
{
// CHECK-LABEL: @test_mm512_set1_epi8
// CHECK: insertelement <64 x i8> {{.*}}, i32 0
@@ -6744,7 +8649,7 @@ __m512d test_mm512_set1_epi8(char d)
return _mm512_set1_epi8(d);
}
-__m512d test_mm512_set1_epi16(short d)
+__m512i test_mm512_set1_epi16(short d)
{
// CHECK-LABEL: @test_mm512_set1_epi16
// CHECK: insertelement <32 x i16> {{.*}}, i32 0
@@ -6801,14 +8706,14 @@ __m512i test_mm512_setr4_epi32(int e0, int e1, int e2, int e3)
return _mm512_setr4_epi64(e0, e1, e2, e3);
}
-__m512i test_mm512_setr4_pd(double e0, double e1, double e2, double e3)
+__m512d test_mm512_setr4_pd(double e0, double e1, double e2, double e3)
{
// CHECK-LABEL: @test_mm512_setr4_pd
// CHECK: insertelement <8 x double> {{.*}}, i32 7
return _mm512_setr4_pd(e0,e1,e2,e3);
}
- __m512i test_mm512_setr4_ps(float e0, float e1, float e2, float e3)
+ __m512 test_mm512_setr4_ps(float e0, float e1, float e2, float e3)
{
// CHECK-LABEL: @test_mm512_setr4_ps
// CHECK: insertelement <16 x float> {{.*}}, i32 15
@@ -6960,7 +8865,8 @@ __m128d test_mm_maskz_cvt_roundss_sd( __mmask8 __U, __m128d __A, __m128 __B) {
__m128d test_mm_cvtu32_sd(__m128d __A, unsigned __B) {
// CHECK-LABEL: @test_mm_cvtu32_sd
- // CHECK: @llvm.x86.avx512.cvtusi2sd
+ // CHECK: uitofp i32 %{{.*}} to double
+ // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
return _mm_cvtu32_sd(__A, __B);
}
@@ -6973,7 +8879,8 @@ __m128d test_mm_cvt_roundu64_sd(__m128d __A, unsigned long long __B) {
__m128d test_mm_cvtu64_sd(__m128d __A, unsigned long long __B) {
// CHECK-LABEL: @test_mm_cvtu64_sd
- // CHECK: @llvm.x86.avx512.cvtusi642sd
+ // CHECK: uitofp i64 %{{.*}} to double
+ // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
return _mm_cvtu64_sd(__A, __B);
}
#endif
@@ -6986,7 +8893,8 @@ __m128 test_mm_cvt_roundu32_ss(__m128 __A, unsigned __B) {
__m128 test_mm_cvtu32_ss(__m128 __A, unsigned __B) {
// CHECK-LABEL: @test_mm_cvtu32_ss
- // CHECK: @llvm.x86.avx512.cvtusi2ss
+ // CHECK: uitofp i32 %{{.*}} to float
+ // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
return _mm_cvtu32_ss(__A, __B);
}
@@ -6999,7 +8907,8 @@ __m128 test_mm_cvt_roundu64_ss(__m128 __A, unsigned long long __B) {
__m128 test_mm_cvtu64_ss(__m128 __A, unsigned long long __B) {
// CHECK-LABEL: @test_mm_cvtu64_ss
- // CHECK: @llvm.x86.avx512.cvtusi642ss
+ // CHECK: uitofp i64 %{{.*}} to float
+ // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
return _mm_cvtu64_ss(__A, __B);
}
#endif
@@ -7021,21 +8930,23 @@ __m512i test_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
__m512 test_mm512_cvtepu32_ps (__m512i __A)
{
// CHECK-LABEL: @test_mm512_cvtepu32_ps
- // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.512
+ // CHECK: uitofp <16 x i32> %{{.*}} to <16 x float>
return _mm512_cvtepu32_ps (__A);
}
__m512 test_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
{
// CHECK-LABEL: @test_mm512_mask_cvtepu32_ps
- // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.512
+ // CHECK: uitofp <16 x i32> %{{.*}} to <16 x float>
+ // CHECK: select <16 x i1> {{.*}}, <16 x float> {{.*}}, <16 x float> {{.*}}
return _mm512_mask_cvtepu32_ps (__W,__U,__A);
}
__m512 test_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
{
// CHECK-LABEL: @test_mm512_maskz_cvtepu32_ps
- // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.512
+ // CHECK: uitofp <16 x i32> %{{.*}} to <16 x float>
+ // CHECK: select <16 x i1> {{.*}}, <16 x float> {{.*}}, <16 x float> {{.*}}
return _mm512_maskz_cvtepu32_ps (__U,__A);
}
@@ -7082,21 +8993,23 @@ __m512d test_mm512_mask_cvtepi32lo_pd (__m512d __W, __mmask8 __U, __m512i __A)
__m512 test_mm512_cvtepi32_ps (__m512i __A)
{
// CHECK-LABEL: @test_mm512_cvtepi32_ps
- // CHECK: @llvm.x86.avx512.mask.cvtdq2ps.512
+ // CHECK: sitofp <16 x i32> %{{.*}} to <16 x float>
return _mm512_cvtepi32_ps (__A);
}
__m512 test_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
{
// CHECK-LABEL: @test_mm512_mask_cvtepi32_ps
- // CHECK: @llvm.x86.avx512.mask.cvtdq2ps.512
+ // CHECK: sitofp <16 x i32> %{{.*}} to <16 x float>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_cvtepi32_ps (__W,__U,__A);
}
__m512 test_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
{
// CHECK-LABEL: @test_mm512_maskz_cvtepi32_ps
- // CHECK: @llvm.x86.avx512.mask.cvtdq2ps.512
+ // CHECK: sitofp <16 x i32> %{{.*}} to <16 x float>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_cvtepi32_ps (__U,__A);
}
@@ -7332,140 +9245,155 @@ float test_mm512_cvtss_f32(__m512 A) {
__m512d test_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
// CHECK-LABEL: @test_mm512_mask_max_pd
- // CHECK: @llvm.x86.avx512.mask.max.pd.512
+ // CHECK: @llvm.x86.avx512.max.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_max_pd (__W,__U,__A,__B);
}
__m512d test_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
// CHECK-LABEL: @test_mm512_maskz_max_pd
- // CHECK: @llvm.x86.avx512.mask.max.pd.512
+ // CHECK: @llvm.x86.avx512.max.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_maskz_max_pd (__U,__A,__B);
}
__m512 test_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
// CHECK-LABEL: @test_mm512_mask_max_ps
- // CHECK: @llvm.x86.avx512.mask.max.ps.512
+ // CHECK: @llvm.x86.avx512.max.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_max_ps (__W,__U,__A,__B);
}
__m512d test_mm512_mask_max_round_pd(__m512d __W,__mmask8 __U,__m512d __A,__m512d __B)
{
// CHECK-LABEL: @test_mm512_mask_max_round_pd
- // CHECK: @llvm.x86.avx512.mask.max.pd.512
+ // CHECK: @llvm.x86.avx512.max.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_max_round_pd(__W,__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
}
__m512d test_mm512_maskz_max_round_pd(__mmask8 __U,__m512d __A,__m512d __B)
{
// CHECK-LABEL: @test_mm512_maskz_max_round_pd
- // CHECK: @llvm.x86.avx512.mask.max.pd.512
+ // CHECK: @llvm.x86.avx512.max.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_maskz_max_round_pd(__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
}
__m512d test_mm512_max_round_pd(__m512d __A,__m512d __B)
{
// CHECK-LABEL: @test_mm512_max_round_pd
- // CHECK: @llvm.x86.avx512.mask.max.pd.512
+ // CHECK: @llvm.x86.avx512.max.pd.512
return _mm512_max_round_pd(__A,__B,_MM_FROUND_CUR_DIRECTION);
}
__m512 test_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
// CHECK-LABEL: @test_mm512_maskz_max_ps
- // CHECK: @llvm.x86.avx512.mask.max.ps.512
+ // CHECK: @llvm.x86.avx512.max.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_max_ps (__U,__A,__B);
}
__m512 test_mm512_mask_max_round_ps(__m512 __W,__mmask16 __U,__m512 __A,__m512 __B)
{
// CHECK-LABEL: @test_mm512_mask_max_round_ps
- // CHECK: @llvm.x86.avx512.mask.max.ps.512
+ // CHECK: @llvm.x86.avx512.max.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_max_round_ps(__W,__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
}
__m512 test_mm512_maskz_max_round_ps(__mmask16 __U,__m512 __A,__m512 __B)
{
// CHECK-LABEL: @test_mm512_maskz_max_round_ps
- // CHECK: @llvm.x86.avx512.mask.max.ps.512
+ // CHECK: @llvm.x86.avx512.max.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_max_round_ps(__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
}
__m512 test_mm512_max_round_ps(__m512 __A,__m512 __B)
{
// CHECK-LABEL: @test_mm512_max_round_ps
- // CHECK: @llvm.x86.avx512.mask.max.ps.512
+ // CHECK: @llvm.x86.avx512.max.ps.512
return _mm512_max_round_ps(__A,__B,_MM_FROUND_CUR_DIRECTION);
}
__m512d test_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
// CHECK-LABEL: @test_mm512_mask_min_pd
- // CHECK: @llvm.x86.avx512.mask.min.pd.512
+ // CHECK: @llvm.x86.avx512.min.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_min_pd (__W,__U,__A,__B);
}
__m512d test_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
// CHECK-LABEL: @test_mm512_maskz_min_pd
- // CHECK: @llvm.x86.avx512.mask.min.pd.512
+ // CHECK: @llvm.x86.avx512.min.pd.512
return _mm512_maskz_min_pd (__U,__A,__B);
}
__m512d test_mm512_mask_min_round_pd(__m512d __W,__mmask8 __U,__m512d __A,__m512d __B)
{
// CHECK-LABEL: @test_mm512_mask_min_round_pd
- // CHECK: @llvm.x86.avx512.mask.min.pd.512
+ // CHECK: @llvm.x86.avx512.min.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_min_round_pd(__W,__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
}
__m512d test_mm512_maskz_min_round_pd(__mmask8 __U,__m512d __A,__m512d __B)
{
// CHECK-LABEL: @test_mm512_maskz_min_round_pd
- // CHECK: @llvm.x86.avx512.mask.min.pd.512
+ // CHECK: @llvm.x86.avx512.min.pd.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_maskz_min_round_pd(__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
}
__m512d test_mm512_min_round_pd( __m512d __A,__m512d __B)
{
// CHECK-LABEL: @test_mm512_min_round_pd
- // CHECK: @llvm.x86.avx512.mask.min.pd.512
+ // CHECK: @llvm.x86.avx512.min.pd.512
return _mm512_min_round_pd(__A,__B,_MM_FROUND_CUR_DIRECTION);
}
__m512 test_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
// CHECK-LABEL: @test_mm512_mask_min_ps
- // CHECK: @llvm.x86.avx512.mask.min.ps.512
+ // CHECK: @llvm.x86.avx512.min.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_min_ps (__W,__U,__A,__B);
}
__m512 test_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
// CHECK-LABEL: @test_mm512_maskz_min_ps
- // CHECK: @llvm.x86.avx512.mask.min.ps.512
+ // CHECK: @llvm.x86.avx512.min.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_min_ps (__U,__A,__B);
}
__m512 test_mm512_mask_min_round_ps(__m512 __W,__mmask16 __U,__m512 __A,__m512 __B)
{
// CHECK-LABEL: @test_mm512_mask_min_round_ps
- // CHECK: @llvm.x86.avx512.mask.min.ps.512
+ // CHECK: @llvm.x86.avx512.min.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_min_round_ps(__W,__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
}
__m512 test_mm512_maskz_min_round_ps(__mmask16 __U,__m512 __A,__m512 __B)
{
// CHECK-LABEL: @test_mm512_maskz_min_round_ps
- // CHECK: @llvm.x86.avx512.mask.min.ps.512
+ // CHECK: @llvm.x86.avx512.min.ps.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_min_round_ps(__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
}
__m512 test_mm512_min_round_ps(__m512 __A,__m512 __B)
{
// CHECK-LABEL: @test_mm512_min_round_ps
- // CHECK: @llvm.x86.avx512.mask.min.ps.512
+ // CHECK: @llvm.x86.avx512.min.ps.512
return _mm512_min_round_ps(__A,__B,_MM_FROUND_CUR_DIRECTION);
}
@@ -8017,7 +9945,6 @@ __m512i test_mm512_setr_epi32 (int __A, int __B, int __C, int __D,
__I, __J, __K, __L,__M, __N, __O, __P);
}
-#ifdef __x86_64__
__m512i test_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
{
// CHECK-LABEL: @test_mm512_mask_set1_epi64
@@ -8047,7 +9974,6 @@ __m512i test_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A)
// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_maskz_set1_epi64 (__M, __A);
}
-#endif
__m512i test_mm512_set_epi64 (long long __A, long long __B, long long __C,
@@ -8373,80 +10299,98 @@ int test_mm512_mask2int(__mmask16 __a)
__m128 test_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
// CHECK-LABEL: @test_mm_mask_move_ss
- // CHECK: extractelement <4 x float> %{{.*}}, i32 0
- // CHECK: extractelement <4 x float> %{{.*}}, i32 0
- // CHECK: phi float [ %{{.*}}, %{{.*}} ], [ %{{.*}}, %{{.*}} ]
- // CHECK: insertelement <4 x float> %{{.*}}, float %cond.i, i32 0
+ // CHECK: [[EXT:%.*]] = extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: insertelement <4 x float> %{{.*}}, float [[EXT]], i32 0
+ // CHECK: [[A:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
+ // CHECK-NEXT: [[B:%.*]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, float [[A]], float [[B]]
+ // CHECK-NEXT: insertelement <4 x float> [[VEC]], float [[SEL]], i64 0
return _mm_mask_move_ss ( __W, __U, __A, __B);
}
__m128 test_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
// CHECK-LABEL: @test_mm_maskz_move_ss
- // CHECK: extractelement <4 x float> %{{.*}}, i32 0
- // CHECK: phi float [ %{{.*}}, %{{.*}} ], [ 0.000000e+00, %{{.*}} ]
- // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+ // CHECK: [[EXT:%.*]] = extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: insertelement <4 x float> %{{.*}}, float [[EXT]], i32 0
+ // CHECK: [[A:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
+ // CHECK-NEXT: [[B:%.*]] = extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, float [[A]], float [[B]]
+ // CHECK-NEXT: insertelement <4 x float> [[VEC]], float [[SEL]], i64 0
return _mm_maskz_move_ss (__U, __A, __B);
}
__m128d test_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
// CHECK-LABEL: @test_mm_mask_move_sd
- // CHECK: extractelement <2 x double> %{{.*}}, i32 0
- // CHECK: extractelement <2 x double> %{{.*}}, i32 0
- // CHECK: phi double [ %{{.*}}, %{{.*}} ], [ %{{.*}}, %{{.*}} ]
- // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
+ // CHECK: [[EXT:%.*]] = extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: insertelement <2 x double> %{{.*}}, double [[EXT]], i32 0
+ // CHECK: [[A:%.*]] = extractelement <2 x double> [[VEC:%.*]], i64 0
+ // CHECK-NEXT: [[B:%.*]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, double [[A]], double [[B]]
+ // CHECK-NEXT: insertelement <2 x double> [[VEC]], double [[SEL]], i64 0
return _mm_mask_move_sd ( __W, __U, __A, __B);
}
__m128d test_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
// CHECK-LABEL: @test_mm_maskz_move_sd
- // CHECK: extractelement <2 x double> %{{.*}}, i32 0
- // CHECK: phi double [ %{{.*}}, %{{.*}} ], [ 0.000000e+00, %{{.*}} ]
- // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
+ // CHECK: [[EXT:%.*]] = extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: insertelement <2 x double> %{{.*}}, double [[EXT]], i32 0
+ // CHECK: [[A:%.*]] = extractelement <2 x double> [[VEC:%.*]], i64 0
+ // CHECK-NEXT: [[B:%.*]] = extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+ // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, double [[A]], double [[B]]
+ // CHECK-NEXT: insertelement <2 x double> [[VEC]], double [[SEL]], i64 0
return _mm_maskz_move_sd (__U, __A, __B);
}
void test_mm_mask_store_ss(float * __P, __mmask8 __U, __m128 __A)
{
// CHECK-LABEL: @test_mm_mask_store_ss
- // CHECK: call void @llvm.masked.store.v16f32.p0v16f32(
+ // CHECK: call void @llvm.masked.store.v4f32.p0v4f32(
_mm_mask_store_ss(__P, __U, __A);
}
void test_mm_mask_store_sd(double * __P, __mmask8 __U, __m128d __A)
{
// CHECK-LABEL: @test_mm_mask_store_sd
- // CHECK: call void @llvm.masked.store.v8f64.p0v8f64(
+ // CHECK: call void @llvm.masked.store.v2f64.p0v2f64(
_mm_mask_store_sd(__P, __U, __A);
}
__m128 test_mm_mask_load_ss(__m128 __A, __mmask8 __U, const float* __W)
{
// CHECK-LABEL: @test_mm_mask_load_ss
- // CHECK: call <16 x float> @llvm.masked.load.v16f32.p0v16f32(
+ // CHECK: call <4 x float> @llvm.masked.load.v4f32.p0v4f32(
return _mm_mask_load_ss(__A, __U, __W);
}
__m128 test_mm_maskz_load_ss (__mmask8 __U, const float * __W)
{
// CHECK-LABEL: @test_mm_maskz_load_ss
- // CHECK: call <16 x float> @llvm.masked.load.v16f32.p0v16f32(
+ // CHECK: call <4 x float> @llvm.masked.load.v4f32.p0v4f32(
return _mm_maskz_load_ss (__U, __W);
}
__m128d test_mm_mask_load_sd (__m128d __A, __mmask8 __U, const double * __W)
{
// CHECK-LABEL: @test_mm_mask_load_sd
- // CHECK: call <8 x double> @llvm.masked.load.v8f64.p0v8f64(
+ // CHECK: call <2 x double> @llvm.masked.load.v2f64.p0v2f64(
return _mm_mask_load_sd (__A, __U, __W);
}
__m128d test_mm_maskz_load_sd (__mmask8 __U, const double * __W)
{
// CHECK-LABEL: @test_mm_maskz_load_sd
- // CHECK: call <8 x double> @llvm.masked.load.v8f64.p0v8f64(
+ // CHECK: call <2 x double> @llvm.masked.load.v2f64.p0v2f64(
return _mm_maskz_load_sd (__U, __W);
}
diff --git a/test/CodeGen/avx512ifma-builtins.c b/test/CodeGen/avx512ifma-builtins.c
index 311d6989bf0d..383cf5350428 100644
--- a/test/CodeGen/avx512ifma-builtins.c
+++ b/test/CodeGen/avx512ifma-builtins.c
@@ -5,36 +5,40 @@
__m512i test_mm512_madd52hi_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
// CHECK-LABEL: @test_mm512_madd52hi_epu64
- // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.512
+ // CHECK: @llvm.x86.avx512.vpmadd52h.uq.512
return _mm512_madd52hi_epu64(__X, __Y, __Z);
}
__m512i test_mm512_mask_madd52hi_epu64(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) {
// CHECK-LABEL: @test_mm512_mask_madd52hi_epu64
- // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.512
+ // CHECK: @llvm.x86.avx512.vpmadd52h.uq.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask_madd52hi_epu64(__W, __M, __X, __Y);
}
__m512i test_mm512_maskz_madd52hi_epu64(__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) {
// CHECK-LABEL: @test_mm512_maskz_madd52hi_epu64
- // CHECK: @llvm.x86.avx512.maskz.vpmadd52h.uq.512
+ // CHECK: @llvm.x86.avx512.vpmadd52h.uq.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_maskz_madd52hi_epu64(__M, __X, __Y, __Z);
}
__m512i test_mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
// CHECK-LABEL: @test_mm512_madd52lo_epu64
- // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.512
+ // CHECK: @llvm.x86.avx512.vpmadd52l.uq.512
return _mm512_madd52lo_epu64(__X, __Y, __Z);
}
__m512i test_mm512_mask_madd52lo_epu64(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) {
// CHECK-LABEL: @test_mm512_mask_madd52lo_epu64
- // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.512
+ // CHECK: @llvm.x86.avx512.vpmadd52l.uq.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask_madd52lo_epu64(__W, __M, __X, __Y);
}
__m512i test_mm512_maskz_madd52lo_epu64(__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) {
// CHECK-LABEL: @test_mm512_maskz_madd52lo_epu64
- // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.512
+ // CHECK: @llvm.x86.avx512.vpmadd52l.uq.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_maskz_madd52lo_epu64(__M, __X, __Y, __Z);
}
diff --git a/test/CodeGen/avx512ifmavl-builtins.c b/test/CodeGen/avx512ifmavl-builtins.c
index 4aeec336ad94..2fe9938eab76 100644
--- a/test/CodeGen/avx512ifmavl-builtins.c
+++ b/test/CodeGen/avx512ifmavl-builtins.c
@@ -4,72 +4,80 @@
__m128i test_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
// CHECK-LABEL: @test_mm_madd52hi_epu64
- // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.128
+ // CHECK: @llvm.x86.avx512.vpmadd52h.uq.128
return _mm_madd52hi_epu64(__X, __Y, __Z);
}
__m128i test_mm_mask_madd52hi_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) {
// CHECK-LABEL: @test_mm_mask_madd52hi_epu64
- // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.128
+ // CHECK: @llvm.x86.avx512.vpmadd52h.uq.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_mask_madd52hi_epu64(__W, __M, __X, __Y);
}
__m128i test_mm_maskz_madd52hi_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) {
// CHECK-LABEL: @test_mm_maskz_madd52hi_epu64
- // CHECK: @llvm.x86.avx512.maskz.vpmadd52h.uq.128
+ // CHECK: @llvm.x86.avx512.vpmadd52h.uq.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_maskz_madd52hi_epu64(__M, __X, __Y, __Z);
}
__m256i test_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
// CHECK-LABEL: @test_mm256_madd52hi_epu64
- // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.256
+ // CHECK: @llvm.x86.avx512.vpmadd52h.uq.256
return _mm256_madd52hi_epu64(__X, __Y, __Z);
}
__m256i test_mm256_mask_madd52hi_epu64(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
// CHECK-LABEL: @test_mm256_mask_madd52hi_epu64
- // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.256
+ // CHECK: @llvm.x86.avx512.vpmadd52h.uq.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_mask_madd52hi_epu64(__W, __M, __X, __Y);
}
__m256i test_mm256_maskz_madd52hi_epu64(__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) {
// CHECK-LABEL: @test_mm256_maskz_madd52hi_epu64
- // CHECK: @llvm.x86.avx512.maskz.vpmadd52h.uq.256
+ // CHECK: @llvm.x86.avx512.vpmadd52h.uq.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_maskz_madd52hi_epu64(__M, __X, __Y, __Z);
}
__m128i test_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
// CHECK-LABEL: @test_mm_madd52lo_epu64
- // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.128
+ // CHECK: @llvm.x86.avx512.vpmadd52l.uq.128
return _mm_madd52lo_epu64(__X, __Y, __Z);
}
__m128i test_mm_mask_madd52lo_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) {
// CHECK-LABEL: @test_mm_mask_madd52lo_epu64
- // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.128
+ // CHECK: @llvm.x86.avx512.vpmadd52l.uq.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_mask_madd52lo_epu64(__W, __M, __X, __Y);
}
__m128i test_mm_maskz_madd52lo_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) {
// CHECK-LABEL: @test_mm_maskz_madd52lo_epu64
- // CHECK: @llvm.x86.avx512.maskz.vpmadd52l.uq.128
+ // CHECK: @llvm.x86.avx512.vpmadd52l.uq.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_maskz_madd52lo_epu64(__M, __X, __Y, __Z);
}
__m256i test_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
// CHECK-LABEL: @test_mm256_madd52lo_epu64
- // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.256
+ // CHECK: @llvm.x86.avx512.vpmadd52l.uq.256
return _mm256_madd52lo_epu64(__X, __Y, __Z);
}
__m256i test_mm256_mask_madd52lo_epu64(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
// CHECK-LABEL: @test_mm256_mask_madd52lo_epu64
- // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.256
+ // CHECK: @llvm.x86.avx512.vpmadd52l.uq.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_mask_madd52lo_epu64(__W, __M, __X, __Y);
}
__m256i test_mm256_maskz_madd52lo_epu64(__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) {
// CHECK-LABEL: @test_mm256_maskz_madd52lo_epu64
- // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.256
+ // CHECK: @llvm.x86.avx512.vpmadd52l.uq.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_maskz_madd52lo_epu64(__M, __X, __Y, __Z);
}
diff --git a/test/CodeGen/avx512vbmi-builtins.c b/test/CodeGen/avx512vbmi-builtins.c
index 0816bce3a6de..0347916af09f 100644
--- a/test/CodeGen/avx512vbmi-builtins.c
+++ b/test/CodeGen/avx512vbmi-builtins.c
@@ -5,43 +5,48 @@
__m512i test_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask2_permutex2var_epi8
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.qi.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.qi.512
+ // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
return _mm512_mask2_permutex2var_epi8(__A, __I, __U, __B);
}
__m512i test_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) {
// CHECK-LABEL: @test_mm512_permutex2var_epi8
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.qi.512
return _mm512_permutex2var_epi8(__A, __I, __B);
}
__m512i test_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_permutex2var_epi8
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.qi.512
+ // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
return _mm512_mask_permutex2var_epi8(__A, __U, __I, __B);
}
__m512i test_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_permutex2var_epi8
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.qi.512
+ // CHECK: @llvm.x86.avx512.vpermi2var.qi.512
+ // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
return _mm512_maskz_permutex2var_epi8(__U, __A, __I, __B);
}
__m512i test_mm512_permutexvar_epi8(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_permutexvar_epi8
- // CHECK: @llvm.x86.avx512.mask.permvar.qi.512
+ // CHECK: @llvm.x86.avx512.permvar.qi.512
return _mm512_permutexvar_epi8(__A, __B);
}
__m512i test_mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_permutexvar_epi8
- // CHECK: @llvm.x86.avx512.mask.permvar.qi.512
+ // CHECK: @llvm.x86.avx512.permvar.qi.512
+ // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
return _mm512_maskz_permutexvar_epi8(__M, __A, __B);
}
__m512i test_mm512_mask_permutexvar_epi8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_permutexvar_epi8
- // CHECK: @llvm.x86.avx512.mask.permvar.qi.512
+ // CHECK: @llvm.x86.avx512.permvar.qi.512
+ // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
return _mm512_mask_permutexvar_epi8(__W, __M, __A, __B);
}
diff --git a/test/CodeGen/avx512vbmi2-builtins.c b/test/CodeGen/avx512vbmi2-builtins.c
index 4da21e39d4e3..db4abdba457f 100644
--- a/test/CodeGen/avx512vbmi2-builtins.c
+++ b/test/CodeGen/avx512vbmi2-builtins.c
@@ -28,13 +28,13 @@ __m512i test_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D) {
void test_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D) {
// CHECK-LABEL: @test_mm512_mask_compressstoreu_epi16
- // CHECK: @llvm.x86.avx512.mask.compress.store.w.512
+ // CHECK: @llvm.masked.compressstore.v32i16(<32 x i16> %{{.*}}, i16* %{{.*}}, <32 x i1> %{{.*}})
_mm512_mask_compressstoreu_epi16(__P, __U, __D);
}
void test_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D) {
// CHECK-LABEL: @test_mm512_mask_compressstoreu_epi8
- // CHECK: @llvm.x86.avx512.mask.compress.store.b.512
+ // CHECK: @llvm.masked.compressstore.v64i8(<64 x i8> %{{.*}}, i8* %{{.*}}, <64 x i1> %{{.*}})
_mm512_mask_compressstoreu_epi8(__P, __U, __D);
}
@@ -64,133 +64,145 @@ __m512i test_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D) {
__m512i test_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const* __P) {
// CHECK-LABEL: @test_mm512_mask_expandloadu_epi16
- // CHECK: @llvm.x86.avx512.mask.expand.load.w.512
+ // CHECK: @llvm.masked.expandload.v32i16(i16* %{{.*}}, <32 x i1> %{{.*}}, <32 x i16> %{{.*}})
return _mm512_mask_expandloadu_epi16(__S, __U, __P);
}
__m512i test_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const* __P) {
// CHECK-LABEL: @test_mm512_maskz_expandloadu_epi16
- // CHECK: @llvm.x86.avx512.mask.expand.load.w.512
+ // CHECK: @llvm.masked.expandload.v32i16(i16* %{{.*}}, <32 x i1> %{{.*}}, <32 x i16> %{{.*}})
return _mm512_maskz_expandloadu_epi16(__U, __P);
}
__m512i test_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const* __P) {
// CHECK-LABEL: @test_mm512_mask_expandloadu_epi8
- // CHECK: @llvm.x86.avx512.mask.expand.load.b.512
+ // CHECK: @llvm.masked.expandload.v64i8(i8* %{{.*}}, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
return _mm512_mask_expandloadu_epi8(__S, __U, __P);
}
__m512i test_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const* __P) {
// CHECK-LABEL: @test_mm512_maskz_expandloadu_epi8
- // CHECK: @llvm.x86.avx512.mask.expand.load.b.512
+ // CHECK: @llvm.masked.expandload.v64i8(i8* %{{.*}}, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
return _mm512_maskz_expandloadu_epi8(__U, __P);
}
__m512i test_mm512_mask_shldi_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_shldi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshld.q.512
+ // CHECK: @llvm.x86.avx512.vpshld.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask_shldi_epi64(__S, __U, __A, __B, 127);
}
__m512i test_mm512_maskz_shldi_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_shldi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshld.q.512
+ // CHECK: @llvm.x86.avx512.vpshld.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_maskz_shldi_epi64(__U, __A, __B, 63);
}
__m512i test_mm512_shldi_epi64(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_shldi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshld.q.512
+ // CHECK: @llvm.x86.avx512.vpshld.q.512
return _mm512_shldi_epi64(__A, __B, 31);
}
__m512i test_mm512_mask_shldi_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_shldi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshld.d.512
+ // CHECK: @llvm.x86.avx512.vpshld.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_shldi_epi32(__S, __U, __A, __B, 127);
}
__m512i test_mm512_maskz_shldi_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_shldi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshld.d.512
+ // CHECK: @llvm.x86.avx512.vpshld.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_shldi_epi32(__U, __A, __B, 63);
}
__m512i test_mm512_shldi_epi32(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_shldi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshld.d.512
+ // CHECK: @llvm.x86.avx512.vpshld.d.512
return _mm512_shldi_epi32(__A, __B, 31);
}
__m512i test_mm512_mask_shldi_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_shldi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshld.w.512
+ // CHECK: @llvm.x86.avx512.vpshld.w.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_mask_shldi_epi16(__S, __U, __A, __B, 127);
}
__m512i test_mm512_maskz_shldi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_shldi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshld.w.512
+ // CHECK: @llvm.x86.avx512.vpshld.w.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_shldi_epi16(__U, __A, __B, 63);
}
__m512i test_mm512_shldi_epi16(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_shldi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshld.w.512
+ // CHECK: @llvm.x86.avx512.vpshld.w.512
return _mm512_shldi_epi16(__A, __B, 31);
}
__m512i test_mm512_mask_shrdi_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_shrdi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshrd.q.512
+ // CHECK: @llvm.x86.avx512.vpshrd.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask_shrdi_epi64(__S, __U, __A, __B, 127);
}
__m512i test_mm512_maskz_shrdi_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_shrdi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshrd.q.512
+ // CHECK: @llvm.x86.avx512.vpshrd.q.512
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_maskz_shrdi_epi64(__U, __A, __B, 63);
}
__m512i test_mm512_shrdi_epi64(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_shrdi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshrd.q.512
+ // CHECK: @llvm.x86.avx512.vpshrd.q.512
return _mm512_shrdi_epi64(__A, __B, 31);
}
__m512i test_mm512_mask_shrdi_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_shrdi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshrd.d.512
+ // CHECK: @llvm.x86.avx512.vpshrd.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_shrdi_epi32(__S, __U, __A, __B, 127);
}
__m512i test_mm512_maskz_shrdi_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_shrdi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshrd.d.512
+ // CHECK: @llvm.x86.avx512.vpshrd.d.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_shrdi_epi32(__U, __A, __B, 63);
}
__m512i test_mm512_shrdi_epi32(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_shrdi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshrd.d.512
+ // CHECK: @llvm.x86.avx512.vpshrd.d.512
return _mm512_shrdi_epi32(__A, __B, 31);
}
__m512i test_mm512_mask_shrdi_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_shrdi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshrd.w.512
+ // CHECK: @llvm.x86.avx512.vpshrd.w.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_mask_shrdi_epi16(__S, __U, __A, __B, 127);
}
__m512i test_mm512_maskz_shrdi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_shrdi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshrd.w.512
+ // CHECK: @llvm.x86.avx512.vpshrd.w.512
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_shrdi_epi16(__U, __A, __B, 63);
}
__m512i test_mm512_shrdi_epi16(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_shrdi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshrd.w.512
+ // CHECK: @llvm.x86.avx512.vpshrd.w.512
return _mm512_shrdi_epi16(__A, __B, 31);
}
diff --git a/test/CodeGen/avx512vbmivl-builtin.c b/test/CodeGen/avx512vbmivl-builtin.c
index b114720758aa..da8986f2e7d8 100644
--- a/test/CodeGen/avx512vbmivl-builtin.c
+++ b/test/CodeGen/avx512vbmivl-builtin.c
@@ -5,85 +5,95 @@
__m128i test_mm_permutexvar_epi8(__m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_permutexvar_epi8
- // CHECK: @llvm.x86.avx512.mask.permvar.qi.128
+ // CHECK: @llvm.x86.avx512.permvar.qi.128
return _mm_permutexvar_epi8(__A, __B);
}
__m128i test_mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_maskz_permutexvar_epi8
- // CHECK: @llvm.x86.avx512.mask.permvar.qi.128
+ // CHECK: @llvm.x86.avx512.permvar.qi.128
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_maskz_permutexvar_epi8(__M, __A, __B);
}
__m128i test_mm_mask_permutexvar_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_permutexvar_epi8
- // CHECK: @llvm.x86.avx512.mask.permvar.qi.128
+ // CHECK: @llvm.x86.avx512.permvar.qi.128
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_mask_permutexvar_epi8(__W, __M, __A, __B);
}
__m256i test_mm256_permutexvar_epi8(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_permutexvar_epi8
- // CHECK: @llvm.x86.avx512.mask.permvar.qi.256
+ // CHECK: @llvm.x86.avx512.permvar.qi.256
return _mm256_permutexvar_epi8(__A, __B);
}
__m256i test_mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_permutexvar_epi8
- // CHECK: @llvm.x86.avx512.mask.permvar.qi.256
+ // CHECK: @llvm.x86.avx512.permvar.qi.256
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_maskz_permutexvar_epi8(__M, __A, __B);
}
__m256i test_mm256_mask_permutexvar_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_permutexvar_epi8
- // CHECK: @llvm.x86.avx512.mask.permvar.qi.256
+ // CHECK: @llvm.x86.avx512.permvar.qi.256
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_mask_permutexvar_epi8(__W, __M, __A, __B);
}
__m128i test_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U, __m128i __B) {
// CHECK-LABEL: @test_mm_mask2_permutex2var_epi8
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.qi.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.qi.128
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_mask2_permutex2var_epi8(__A, __I, __U, __B);
}
__m256i test_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask2_permutex2var_epi8
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.qi.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.qi.256
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_mask2_permutex2var_epi8(__A, __I, __U, __B);
}
__m128i test_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) {
// CHECK-LABEL: @test_mm_permutex2var_epi8
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.qi.128
return _mm_permutex2var_epi8(__A, __I, __B);
}
__m128i test_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_permutex2var_epi8
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.qi.128
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_mask_permutex2var_epi8(__A, __U, __I, __B);
}
__m128i test_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, __m128i __B) {
// CHECK-LABEL: @test_mm_maskz_permutex2var_epi8
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.qi.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.qi.128
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B);
}
__m256i test_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) {
// CHECK-LABEL: @test_mm256_permutex2var_epi8
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.qi.256
return _mm256_permutex2var_epi8(__A, __I, __B);
}
__m256i test_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_permutex2var_epi8
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.qi.256
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_mask_permutex2var_epi8(__A, __U, __I, __B);
}
__m256i test_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_permutex2var_epi8
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.qi.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.qi.256
+ // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B);
}
diff --git a/test/CodeGen/avx512vl-builtins.c b/test/CodeGen/avx512vl-builtins.c
index b4fc86da704b..7e4e64381c1d 100644
--- a/test/CodeGen/avx512vl-builtins.c
+++ b/test/CodeGen/avx512vl-builtins.c
@@ -727,14 +727,22 @@ __m128i test_mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
__m256i test_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
__m256i __Y) {
//CHECK-LABEL: @test_mm256_mask_mul_epi32
- //CHECK: @llvm.x86.avx2.pmul.dq
+ //CHECK: shl <4 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32>
+ //CHECK: ashr <4 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32>
+ //CHECK: shl <4 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32>
+ //CHECK: ashr <4 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32>
+ //CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
//CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_mask_mul_epi32(__W, __M, __X, __Y);
}
__m256i test_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) {
//CHECK-LABEL: @test_mm256_maskz_mul_epi32
- //CHECK: @llvm.x86.avx2.pmul.dq
+ //CHECK: shl <4 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32>
+ //CHECK: ashr <4 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32>
+ //CHECK: shl <4 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32>
+ //CHECK: ashr <4 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32>
+ //CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
//CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_maskz_mul_epi32(__M, __X, __Y);
}
@@ -743,14 +751,22 @@ __m256i test_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) {
__m128i test_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X,
__m128i __Y) {
//CHECK-LABEL: @test_mm_mask_mul_epi32
- //CHECK: @llvm.x86.sse41.pmuldq
+ //CHECK: shl <2 x i64> %{{.*}}, <i64 32, i64 32>
+ //CHECK: ashr <2 x i64> %{{.*}}, <i64 32, i64 32>
+ //CHECK: shl <2 x i64> %{{.*}}, <i64 32, i64 32>
+ //CHECK: ashr <2 x i64> %{{.*}}, <i64 32, i64 32>
+ //CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
//CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_mask_mul_epi32(__W, __M, __X, __Y);
}
__m128i test_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y) {
//CHECK-LABEL: @test_mm_maskz_mul_epi32
- //CHECK: @llvm.x86.sse41.pmuldq
+ //CHECK: shl <2 x i64> %{{.*}}, <i64 32, i64 32>
+ //CHECK: ashr <2 x i64> %{{.*}}, <i64 32, i64 32>
+ //CHECK: shl <2 x i64> %{{.*}}, <i64 32, i64 32>
+ //CHECK: ashr <2 x i64> %{{.*}}, <i64 32, i64 32>
+ //CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
//CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_maskz_mul_epi32(__M, __X, __Y);
}
@@ -758,14 +774,18 @@ __m128i test_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y) {
__m256i test_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X,
__m256i __Y) {
//CHECK-LABEL: @test_mm256_mask_mul_epu32
- //CHECK: @llvm.x86.avx2.pmulu.dq
+ //CHECK: and <4 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ //CHECK: and <4 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ //CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
//CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_mask_mul_epu32(__W, __M, __X, __Y);
}
__m256i test_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) {
//CHECK-LABEL: @test_mm256_maskz_mul_epu32
- //CHECK: @llvm.x86.avx2.pmulu.dq
+ //CHECK: and <4 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ //CHECK: and <4 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ //CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
//CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_maskz_mul_epu32(__M, __X, __Y);
}
@@ -773,14 +793,18 @@ __m256i test_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) {
__m128i test_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X,
__m128i __Y) {
//CHECK-LABEL: @test_mm_mask_mul_epu32
- //CHECK: @llvm.x86.sse2.pmulu.dq
+ //CHECK: and <2 x i64> %{{.*}}, <i64 4294967295, i64 4294967295>
+ //CHECK: and <2 x i64> %{{.*}}, <i64 4294967295, i64 4294967295>
+ //CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
//CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_mask_mul_epu32(__W, __M, __X, __Y);
}
__m128i test_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y) {
//CHECK-LABEL: @test_mm_maskz_mul_epu32
- //CHECK: @llvm.x86.sse2.pmulu.dq
+ //CHECK: and <2 x i64> %{{.*}}, <i64 4294967295, i64 4294967295>
+ //CHECK: and <2 x i64> %{{.*}}, <i64 4294967295, i64 4294967295>
+ //CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
//CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_maskz_mul_epu32(__M, __X, __Y);
}
@@ -1047,483 +1071,2417 @@ __m128i test_mm_maskz_xor_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
return _mm_maskz_xor_epi64( __U, __A, __B);
}
-__mmask8 test_mm256_cmp_ps_mask(__m256 __A, __m256 __B) {
- // CHECK-LABEL: @test_mm256_cmp_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.256
- return (__mmask8)_mm256_cmp_ps_mask(__A, __B, 0);
+__mmask8 test_mm256_cmp_ps_mask_eq_oq(__m256 a, __m256 b) {
+ // CHECK-LABEL: @test_mm256_cmp_ps_mask_eq_oq
+ // CHECK: fcmp oeq <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_EQ_OQ);
}
-__mmask8 test_mm256_mask_cmp_ps_mask(__mmask8 m, __m256 __A, __m256 __B) {
- // CHECK-LABEL: @test_mm256_mask_cmp_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.256
- return _mm256_mask_cmp_ps_mask(m, __A, __B, 0);
+__mmask8 test_mm256_cmp_ps_mask_lt_os(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_lt_os
+ // CHECK: fcmp olt <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_LT_OS);
}
-__mmask8 test_mm_cmp_ps_mask(__m128 __A, __m128 __B) {
- // CHECK-LABEL: @test_mm_cmp_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.128
- return (__mmask8)_mm_cmp_ps_mask(__A, __B, 0);
+__mmask8 test_mm256_cmp_ps_mask_le_os(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_le_os
+ // CHECK: fcmp ole <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_LE_OS);
}
-__mmask8 test_mm_mask_cmp_ps_mask(__mmask8 m, __m128 __A, __m128 __B) {
- // CHECK-LABEL: @test_mm_mask_cmp_ps_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.ps.128
- return _mm_mask_cmp_ps_mask(m, __A, __B, 0);
+__mmask8 test_mm256_cmp_ps_mask_unord_q(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_unord_q
+ // CHECK: fcmp uno <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_UNORD_Q);
}
-__mmask8 test_mm256_cmp_pd_mask(__m256d __A, __m256d __B) {
- // CHECK-LABEL: @test_mm256_cmp_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.256
- return (__mmask8)_mm256_cmp_pd_mask(__A, __B, 0);
+__mmask8 test_mm256_cmp_ps_mask_neq_uq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_neq_uq
+ // CHECK: fcmp une <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_NEQ_UQ);
}
-__mmask8 test_mm256_mask_cmp_pd_mask(__mmask8 m, __m256d __A, __m256d __B) {
- // CHECK-LABEL: @test_mm256_mask_cmp_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.256
- return _mm256_mask_cmp_pd_mask(m, __A, __B, 0);
+__mmask8 test_mm256_cmp_ps_mask_nlt_us(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_nlt_us
+ // CHECK: fcmp uge <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_NLT_US);
}
-__mmask8 test_mm_cmp_pd_mask(__m128d __A, __m128d __B) {
- // CHECK-LABEL: @test_mm_cmp_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.128
- return (__mmask8)_mm_cmp_pd_mask(__A, __B, 0);
+__mmask8 test_mm256_cmp_ps_mask_nle_us(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_nle_us
+ // CHECK: fcmp ugt <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_NLE_US);
}
-__mmask8 test_mm_mask_cmp_pd_mask(__mmask8 m, __m128d __A, __m128d __B) {
- // CHECK-LABEL: @test_mm_mask_cmp_pd_mask
- // CHECK: @llvm.x86.avx512.mask.cmp.pd.128
- return _mm_mask_cmp_pd_mask(m, __A, __B, 0);
+__mmask8 test_mm256_cmp_ps_mask_ord_q(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_ord_q
+ // CHECK: fcmp ord <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_ORD_Q);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_eq_uq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_eq_uq
+ // CHECK: fcmp ueq <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_EQ_UQ);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_nge_us(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_nge_us
+ // CHECK: fcmp ult <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_NGE_US);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_ngt_us(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_ngt_us
+ // CHECK: fcmp ule <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_NGT_US);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_false_oq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_false_oq
+ // CHECK: fcmp false <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_FALSE_OQ);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_neq_oq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_neq_oq
+ // CHECK: fcmp one <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_NEQ_OQ);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_ge_os(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_ge_os
+ // CHECK: fcmp oge <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_GE_OS);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_gt_os(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_gt_os
+ // CHECK: fcmp ogt <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_GT_OS);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_true_uq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_true_uq
+ // CHECK: fcmp true <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_TRUE_UQ);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_eq_os(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_eq_os
+ // CHECK: fcmp oeq <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_EQ_OS);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_lt_oq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_lt_oq
+ // CHECK: fcmp olt <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_le_oq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_le_oq
+ // CHECK: fcmp ole <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_LE_OQ);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_unord_s(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_unord_s
+ // CHECK: fcmp uno <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_UNORD_S);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_neq_us(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_neq_us
+ // CHECK: fcmp une <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_NEQ_US);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_nlt_uq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_nlt_uq
+ // CHECK: fcmp uge <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_NLT_UQ);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_nle_uq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_nle_uq
+ // CHECK: fcmp ugt <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_NLE_UQ);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_ord_s(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_ord_s
+ // CHECK: fcmp ord <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_ORD_S);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_eq_us(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_eq_us
+ // CHECK: fcmp ueq <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_EQ_US);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_nge_uq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_nge_uq
+ // CHECK: fcmp ult <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_NGE_UQ);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_ngt_uq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_ngt_uq
+ // CHECK: fcmp ule <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_NGT_UQ);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_false_os(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_false_os
+ // CHECK: fcmp false <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_FALSE_OS);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_neq_os(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_neq_os
+ // CHECK: fcmp one <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_NEQ_OS);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_ge_oq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_ge_oq
+ // CHECK: fcmp oge <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_GE_OQ);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_gt_oq(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_gt_oq
+ // CHECK: fcmp ogt <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_GT_OQ);
+}
+
+__mmask8 test_mm256_cmp_ps_mask_true_us(__m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_cmp_ps_mask_true_us
+ // CHECK: fcmp true <8 x float> %{{.*}}, %{{.*}}
+ return _mm256_cmp_ps_mask(a, b, _CMP_TRUE_US);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_eq_oq(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: @test_mm256_mask_cmp_ps_mask_eq_oq
+ // CHECK: [[CMP:%.*]] = fcmp oeq <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_EQ_OQ);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_lt_os(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_lt_os
+ // CHECK: [[CMP:%.*]] = fcmp olt <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_LT_OS);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_le_os(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_le_os
+ // CHECK: [[CMP:%.*]] = fcmp ole <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_LE_OS);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_unord_q(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_unord_q
+ // CHECK: [[CMP:%.*]] = fcmp uno <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_UNORD_Q);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_neq_uq(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_neq_uq
+ // CHECK: [[CMP:%.*]] = fcmp une <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_UQ);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_nlt_us(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_nlt_us
+ // CHECK: [[CMP:%.*]] = fcmp uge <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_NLT_US);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_nle_us(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_nle_us
+ // CHECK: [[CMP:%.*]] = fcmp ugt <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_NLE_US);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_ord_q(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_ord_q
+ // CHECK: [[CMP:%.*]] = fcmp ord <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_ORD_Q);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_eq_uq(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_eq_uq
+ // CHECK: [[CMP:%.*]] = fcmp ueq <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_EQ_UQ);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_nge_us(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_nge_us
+ // CHECK: [[CMP:%.*]] = fcmp ult <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_NGE_US);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_ngt_us(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_ngt_us
+ // CHECK: [[CMP:%.*]] = fcmp ule <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_NGT_US);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_false_oq(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_false_oq
+ // CHECK: [[CMP:%.*]] = fcmp false <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_FALSE_OQ);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_neq_oq(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_neq_oq
+ // CHECK: [[CMP:%.*]] = fcmp one <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_OQ);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_ge_os(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_ge_os
+ // CHECK: [[CMP:%.*]] = fcmp oge <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_GE_OS);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_gt_os(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_gt_os
+ // CHECK: [[CMP:%.*]] = fcmp ogt <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_GT_OS);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_true_uq(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_true_uq
+ // CHECK: [[CMP:%.*]] = fcmp true <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_TRUE_UQ);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_eq_os(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_eq_os
+ // CHECK: [[CMP:%.*]] = fcmp oeq <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_EQ_OS);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_lt_oq(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_lt_oq
+ // CHECK: [[CMP:%.*]] = fcmp olt <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_LT_OQ);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_le_oq(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_le_oq
+ // CHECK: [[CMP:%.*]] = fcmp ole <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_LE_OQ);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_unord_s(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_unord_s
+ // CHECK: [[CMP:%.*]] = fcmp uno <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_UNORD_S);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_neq_us(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_neq_us
+ // CHECK: [[CMP:%.*]] = fcmp une <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_US);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_nlt_uq(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_nlt_uq
+ // CHECK: [[CMP:%.*]] = fcmp uge <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_NLT_UQ);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_nle_uq(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_nle_uq
+ // CHECK: [[CMP:%.*]] = fcmp ugt <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_NLE_UQ);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_ord_s(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_ord_s
+ // CHECK: [[CMP:%.*]] = fcmp ord <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_ORD_S);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_eq_us(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_eq_us
+ // CHECK: [[CMP:%.*]] = fcmp ueq <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_EQ_US);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_nge_uq(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_nge_uq
+ // CHECK: [[CMP:%.*]] = fcmp ult <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_NGE_UQ);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_ngt_uq(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_ngt_uq
+ // CHECK: [[CMP:%.*]] = fcmp ule <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_NGT_UQ);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_false_os(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_false_os
+ // CHECK: [[CMP:%.*]] = fcmp false <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_FALSE_OS);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_neq_os(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_neq_os
+ // CHECK: [[CMP:%.*]] = fcmp one <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_OS);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_ge_oq(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_ge_oq
+ // CHECK: [[CMP:%.*]] = fcmp oge <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_GE_OQ);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_gt_oq(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_gt_oq
+ // CHECK: [[CMP:%.*]] = fcmp ogt <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_GT_OQ);
+}
+
+__mmask8 test_mm256_mask_cmp_ps_mask_true_us(__mmask8 m, __m256 a, __m256 b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_ps_mask_true_us
+ // CHECK: [[CMP:%.*]] = fcmp true <8 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <8 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_ps_mask(m, a, b, _CMP_TRUE_US);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_eq_oq(__m256d a, __m256d b) {
+ // CHECK-LABEL: @test_mm256_cmp_pd_mask_eq_oq
+ // CHECK: fcmp oeq <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_EQ_OQ);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_lt_os(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_lt_os
+ // CHECK: fcmp olt <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_LT_OS);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_le_os(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_le_os
+ // CHECK: fcmp ole <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_LE_OS);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_unord_q(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_unord_q
+ // CHECK: fcmp uno <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_UNORD_Q);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_neq_uq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_neq_uq
+ // CHECK: fcmp une <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_NEQ_UQ);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_nlt_us(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_nlt_us
+ // CHECK: fcmp uge <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_NLT_US);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_nle_us(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_nle_us
+ // CHECK: fcmp ugt <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_NLE_US);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_ord_q(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_ord_q
+ // CHECK: fcmp ord <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_ORD_Q);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_eq_uq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_eq_uq
+ // CHECK: fcmp ueq <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_EQ_UQ);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_nge_us(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_nge_us
+ // CHECK: fcmp ult <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_NGE_US);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_ngt_us(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_ngt_us
+ // CHECK: fcmp ule <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_NGT_US);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_false_oq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_false_oq
+ // CHECK: fcmp false <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_FALSE_OQ);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_neq_oq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_neq_oq
+ // CHECK: fcmp one <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_NEQ_OQ);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_ge_os(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_ge_os
+ // CHECK: fcmp oge <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_GE_OS);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_gt_os(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_gt_os
+ // CHECK: fcmp ogt <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_GT_OS);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_true_uq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_true_uq
+ // CHECK: fcmp true <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_TRUE_UQ);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_eq_os(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_eq_os
+ // CHECK: fcmp oeq <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_EQ_OS);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_lt_oq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_lt_oq
+ // CHECK: fcmp olt <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_LT_OQ);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_le_oq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_le_oq
+ // CHECK: fcmp ole <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_LE_OQ);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_unord_s(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_unord_s
+ // CHECK: fcmp uno <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_UNORD_S);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_neq_us(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_neq_us
+ // CHECK: fcmp une <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_NEQ_US);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_nlt_uq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_nlt_uq
+ // CHECK: fcmp uge <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_NLT_UQ);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_nle_uq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_nle_uq
+ // CHECK: fcmp ugt <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_NLE_UQ);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_ord_s(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_ord_s
+ // CHECK: fcmp ord <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_ORD_S);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_eq_us(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_eq_us
+ // CHECK: fcmp ueq <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_EQ_US);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_nge_uq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_nge_uq
+ // CHECK: fcmp ult <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_NGE_UQ);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_ngt_uq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_ngt_uq
+ // CHECK: fcmp ule <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_NGT_UQ);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_false_os(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_false_os
+ // CHECK: fcmp false <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_FALSE_OS);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_neq_os(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_neq_os
+ // CHECK: fcmp one <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_NEQ_OS);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_ge_oq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_ge_oq
+ // CHECK: fcmp oge <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_GE_OQ);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_gt_oq(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_gt_oq
+ // CHECK: fcmp ogt <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_GT_OQ);
+}
+
+__mmask8 test_mm256_cmp_pd_mask_true_us(__m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_cmp_pd_mask_true_us
+ // CHECK: fcmp true <4 x double> %{{.*}}, %{{.*}}
+ return _mm256_cmp_pd_mask(a, b, _CMP_TRUE_US);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_eq_oq(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: @test_mm256_mask_cmp_pd_mask_eq_oq
+ // CHECK: [[CMP:%.*]] = fcmp oeq <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_EQ_OQ);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_lt_os(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_lt_os
+ // CHECK: [[CMP:%.*]] = fcmp olt <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_LT_OS);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_le_os(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_le_os
+ // CHECK: [[CMP:%.*]] = fcmp ole <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_LE_OS);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_unord_q(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_unord_q
+ // CHECK: [[CMP:%.*]] = fcmp uno <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_UNORD_Q);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_neq_uq(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_neq_uq
+ // CHECK: [[CMP:%.*]] = fcmp une <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_UQ);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_nlt_us(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_nlt_us
+ // CHECK: [[CMP:%.*]] = fcmp uge <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_NLT_US);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_nle_us(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_nle_us
+ // CHECK: [[CMP:%.*]] = fcmp ugt <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_NLE_US);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_ord_q(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_ord_q
+ // CHECK: [[CMP:%.*]] = fcmp ord <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_ORD_Q);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_eq_uq(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_eq_uq
+ // CHECK: [[CMP:%.*]] = fcmp ueq <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_EQ_UQ);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_nge_us(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_nge_us
+ // CHECK: [[CMP:%.*]] = fcmp ult <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_NGE_US);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_ngt_us(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_ngt_us
+ // CHECK: [[CMP:%.*]] = fcmp ule <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_NGT_US);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_false_oq(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_false_oq
+ // CHECK: [[CMP:%.*]] = fcmp false <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_FALSE_OQ);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_neq_oq(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_neq_oq
+ // CHECK: [[CMP:%.*]] = fcmp one <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_OQ);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_ge_os(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_ge_os
+ // CHECK: [[CMP:%.*]] = fcmp oge <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_GE_OS);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_gt_os(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_gt_os
+ // CHECK: [[CMP:%.*]] = fcmp ogt <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_GT_OS);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_true_uq(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_true_uq
+ // CHECK: [[CMP:%.*]] = fcmp true <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_TRUE_UQ);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_eq_os(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_eq_os
+ // CHECK: [[CMP:%.*]] = fcmp oeq <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_EQ_OS);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_lt_oq(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_lt_oq
+ // CHECK: [[CMP:%.*]] = fcmp olt <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_LT_OQ);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_le_oq(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_le_oq
+ // CHECK: [[CMP:%.*]] = fcmp ole <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_LE_OQ);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_unord_s(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_unord_s
+ // CHECK: [[CMP:%.*]] = fcmp uno <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_UNORD_S);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_neq_us(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_neq_us
+ // CHECK: [[CMP:%.*]] = fcmp une <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_US);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_nlt_uq(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_nlt_uq
+ // CHECK: [[CMP:%.*]] = fcmp uge <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_NLT_UQ);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_nle_uq(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_nle_uq
+ // CHECK: [[CMP:%.*]] = fcmp ugt <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_NLE_UQ);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_ord_s(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_ord_s
+ // CHECK: [[CMP:%.*]] = fcmp ord <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_ORD_S);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_eq_us(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_eq_us
+ // CHECK: [[CMP:%.*]] = fcmp ueq <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_EQ_US);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_nge_uq(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_nge_uq
+ // CHECK: [[CMP:%.*]] = fcmp ult <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_NGE_UQ);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_ngt_uq(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_ngt_uq
+ // CHECK: [[CMP:%.*]] = fcmp ule <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_NGT_UQ);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_false_os(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_false_os
+ // CHECK: [[CMP:%.*]] = fcmp false <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_FALSE_OS);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_neq_os(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_neq_os
+ // CHECK: [[CMP:%.*]] = fcmp one <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_OS);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_ge_oq(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_ge_oq
+ // CHECK: [[CMP:%.*]] = fcmp oge <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_GE_OQ);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_gt_oq(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_gt_oq
+ // CHECK: [[CMP:%.*]] = fcmp ogt <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_GT_OQ);
+}
+
+__mmask8 test_mm256_mask_cmp_pd_mask_true_us(__mmask8 m, __m256d a, __m256d b) {
+ // CHECK-LABEL: test_mm256_mask_cmp_pd_mask_true_us
+ // CHECK: [[CMP:%.*]] = fcmp true <4 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm256_mask_cmp_pd_mask(m, a, b, _CMP_TRUE_US);
+}
+
+__mmask8 test_mm_cmp_ps_mask_eq_oq(__m128 a, __m128 b) {
+ // CHECK-LABEL: @test_mm_cmp_ps_mask_eq_oq
+ // CHECK: fcmp oeq <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_EQ_OQ);
+}
+
+__mmask8 test_mm_cmp_ps_mask_lt_os(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_lt_os
+ // CHECK: fcmp olt <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_LT_OS);
+}
+
+__mmask8 test_mm_cmp_ps_mask_le_os(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_le_os
+ // CHECK: fcmp ole <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_LE_OS);
+}
+
+__mmask8 test_mm_cmp_ps_mask_unord_q(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_unord_q
+ // CHECK: fcmp uno <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_UNORD_Q);
+}
+
+__mmask8 test_mm_cmp_ps_mask_neq_uq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_neq_uq
+ // CHECK: fcmp une <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_NEQ_UQ);
+}
+
+__mmask8 test_mm_cmp_ps_mask_nlt_us(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_nlt_us
+ // CHECK: fcmp uge <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_NLT_US);
+}
+
+__mmask8 test_mm_cmp_ps_mask_nle_us(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_nle_us
+ // CHECK: fcmp ugt <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_NLE_US);
+}
+
+__mmask8 test_mm_cmp_ps_mask_ord_q(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_ord_q
+ // CHECK: fcmp ord <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_ORD_Q);
+}
+
+__mmask8 test_mm_cmp_ps_mask_eq_uq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_eq_uq
+ // CHECK: fcmp ueq <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_EQ_UQ);
+}
+
+__mmask8 test_mm_cmp_ps_mask_nge_us(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_nge_us
+ // CHECK: fcmp ult <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_NGE_US);
+}
+
+__mmask8 test_mm_cmp_ps_mask_ngt_us(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_ngt_us
+ // CHECK: fcmp ule <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_NGT_US);
+}
+
+__mmask8 test_mm_cmp_ps_mask_false_oq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_false_oq
+ // CHECK: fcmp false <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_FALSE_OQ);
+}
+
+__mmask8 test_mm_cmp_ps_mask_neq_oq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_neq_oq
+ // CHECK: fcmp one <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_NEQ_OQ);
+}
+
+__mmask8 test_mm_cmp_ps_mask_ge_os(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_ge_os
+ // CHECK: fcmp oge <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_GE_OS);
+}
+
+__mmask8 test_mm_cmp_ps_mask_gt_os(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_gt_os
+ // CHECK: fcmp ogt <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_GT_OS);
+}
+
+__mmask8 test_mm_cmp_ps_mask_true_uq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_true_uq
+ // CHECK: fcmp true <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_TRUE_UQ);
+}
+
+__mmask8 test_mm_cmp_ps_mask_eq_os(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_eq_os
+ // CHECK: fcmp oeq <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_EQ_OS);
+}
+
+__mmask8 test_mm_cmp_ps_mask_lt_oq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_lt_oq
+ // CHECK: fcmp olt <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_LT_OQ);
+}
+
+__mmask8 test_mm_cmp_ps_mask_le_oq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_le_oq
+ // CHECK: fcmp ole <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_LE_OQ);
+}
+
+__mmask8 test_mm_cmp_ps_mask_unord_s(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_unord_s
+ // CHECK: fcmp uno <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_UNORD_S);
+}
+
+__mmask8 test_mm_cmp_ps_mask_neq_us(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_neq_us
+ // CHECK: fcmp une <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_NEQ_US);
+}
+
+__mmask8 test_mm_cmp_ps_mask_nlt_uq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_nlt_uq
+ // CHECK: fcmp uge <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_NLT_UQ);
+}
+
+__mmask8 test_mm_cmp_ps_mask_nle_uq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_nle_uq
+ // CHECK: fcmp ugt <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_NLE_UQ);
+}
+
+__mmask8 test_mm_cmp_ps_mask_ord_s(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_ord_s
+ // CHECK: fcmp ord <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_ORD_S);
+}
+
+__mmask8 test_mm_cmp_ps_mask_eq_us(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_eq_us
+ // CHECK: fcmp ueq <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_EQ_US);
+}
+
+__mmask8 test_mm_cmp_ps_mask_nge_uq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_nge_uq
+ // CHECK: fcmp ult <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_NGE_UQ);
+}
+
+__mmask8 test_mm_cmp_ps_mask_ngt_uq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_ngt_uq
+ // CHECK: fcmp ule <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_NGT_UQ);
+}
+
+__mmask8 test_mm_cmp_ps_mask_false_os(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_false_os
+ // CHECK: fcmp false <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_FALSE_OS);
+}
+
+__mmask8 test_mm_cmp_ps_mask_neq_os(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_neq_os
+ // CHECK: fcmp one <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_NEQ_OS);
+}
+
+__mmask8 test_mm_cmp_ps_mask_ge_oq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_ge_oq
+ // CHECK: fcmp oge <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_GE_OQ);
+}
+
+__mmask8 test_mm_cmp_ps_mask_gt_oq(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_gt_oq
+ // CHECK: fcmp ogt <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_GT_OQ);
+}
+
+__mmask8 test_mm_cmp_ps_mask_true_us(__m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_cmp_ps_mask_true_us
+ // CHECK: fcmp true <4 x float> %{{.*}}, %{{.*}}
+ return _mm_cmp_ps_mask(a, b, _CMP_TRUE_US);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_eq_oq(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: @test_mm_mask_cmp_ps_mask_eq_oq
+ // CHECK: [[CMP:%.*]] = fcmp oeq <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_EQ_OQ);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_lt_os(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_lt_os
+ // CHECK: [[CMP:%.*]] = fcmp olt <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_LT_OS);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_le_os(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_le_os
+ // CHECK: [[CMP:%.*]] = fcmp ole <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_LE_OS);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_unord_q(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_unord_q
+ // CHECK: [[CMP:%.*]] = fcmp uno <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_UNORD_Q);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_neq_uq(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_neq_uq
+ // CHECK: [[CMP:%.*]] = fcmp une <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_UQ);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_nlt_us(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_nlt_us
+ // CHECK: [[CMP:%.*]] = fcmp uge <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_NLT_US);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_nle_us(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_nle_us
+ // CHECK: [[CMP:%.*]] = fcmp ugt <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_NLE_US);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_ord_q(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_ord_q
+ // CHECK: [[CMP:%.*]] = fcmp ord <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_ORD_Q);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_eq_uq(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_eq_uq
+ // CHECK: [[CMP:%.*]] = fcmp ueq <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_EQ_UQ);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_nge_us(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_nge_us
+ // CHECK: [[CMP:%.*]] = fcmp ult <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_NGE_US);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_ngt_us(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_ngt_us
+ // CHECK: [[CMP:%.*]] = fcmp ule <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_NGT_US);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_false_oq(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_false_oq
+ // CHECK: [[CMP:%.*]] = fcmp false <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_FALSE_OQ);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_neq_oq(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_neq_oq
+ // CHECK: [[CMP:%.*]] = fcmp one <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_OQ);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_ge_os(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_ge_os
+ // CHECK: [[CMP:%.*]] = fcmp oge <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_GE_OS);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_gt_os(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_gt_os
+ // CHECK: [[CMP:%.*]] = fcmp ogt <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_GT_OS);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_true_uq(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_true_uq
+ // CHECK: [[CMP:%.*]] = fcmp true <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_TRUE_UQ);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_eq_os(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_eq_os
+ // CHECK: [[CMP:%.*]] = fcmp oeq <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_EQ_OS);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_lt_oq(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_lt_oq
+ // CHECK: [[CMP:%.*]] = fcmp olt <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_LT_OQ);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_le_oq(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_le_oq
+ // CHECK: [[CMP:%.*]] = fcmp ole <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_LE_OQ);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_unord_s(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_unord_s
+ // CHECK: [[CMP:%.*]] = fcmp uno <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_UNORD_S);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_neq_us(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_neq_us
+ // CHECK: [[CMP:%.*]] = fcmp une <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_US);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_nlt_uq(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_nlt_uq
+ // CHECK: [[CMP:%.*]] = fcmp uge <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_NLT_UQ);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_nle_uq(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_nle_uq
+ // CHECK: [[CMP:%.*]] = fcmp ugt <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_NLE_UQ);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_ord_s(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_ord_s
+ // CHECK: [[CMP:%.*]] = fcmp ord <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_ORD_S);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_eq_us(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_eq_us
+ // CHECK: [[CMP:%.*]] = fcmp ueq <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_EQ_US);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_nge_uq(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_nge_uq
+ // CHECK: [[CMP:%.*]] = fcmp ult <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_NGE_UQ);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_ngt_uq(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_ngt_uq
+ // CHECK: [[CMP:%.*]] = fcmp ule <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_NGT_UQ);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_false_os(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_false_os
+ // CHECK: [[CMP:%.*]] = fcmp false <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_FALSE_OS);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_neq_os(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_neq_os
+ // CHECK: [[CMP:%.*]] = fcmp one <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_OS);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_ge_oq(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_ge_oq
+ // CHECK: [[CMP:%.*]] = fcmp oge <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_GE_OQ);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_gt_oq(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_gt_oq
+ // CHECK: [[CMP:%.*]] = fcmp ogt <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_GT_OQ);
+}
+
+__mmask8 test_mm_mask_cmp_ps_mask_true_us(__mmask8 m, __m128 a, __m128 b) {
+ // CHECK-LABEL: test_mm_mask_cmp_ps_mask_true_us
+ // CHECK: [[CMP:%.*]] = fcmp true <4 x float> %{{.*}}, %{{.*}}
+ // CHECK: and <4 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_ps_mask(m, a, b, _CMP_TRUE_US);
+}
+
+__mmask8 test_mm_cmp_pd_mask_eq_oq(__m128d a, __m128d b) {
+ // CHECK-LABEL: @test_mm_cmp_pd_mask_eq_oq
+ // CHECK: fcmp oeq <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_EQ_OQ);
+}
+
+__mmask8 test_mm_cmp_pd_mask_lt_os(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_lt_os
+ // CHECK: fcmp olt <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_LT_OS);
+}
+
+__mmask8 test_mm_cmp_pd_mask_le_os(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_le_os
+ // CHECK: fcmp ole <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_LE_OS);
+}
+
+__mmask8 test_mm_cmp_pd_mask_unord_q(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_unord_q
+ // CHECK: fcmp uno <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_UNORD_Q);
+}
+
+__mmask8 test_mm_cmp_pd_mask_neq_uq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_neq_uq
+ // CHECK: fcmp une <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_NEQ_UQ);
+}
+
+__mmask8 test_mm_cmp_pd_mask_nlt_us(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_nlt_us
+ // CHECK: fcmp uge <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_NLT_US);
+}
+
+__mmask8 test_mm_cmp_pd_mask_nle_us(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_nle_us
+ // CHECK: fcmp ugt <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_NLE_US);
+}
+
+__mmask8 test_mm_cmp_pd_mask_ord_q(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_ord_q
+ // CHECK: fcmp ord <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_ORD_Q);
+}
+
+__mmask8 test_mm_cmp_pd_mask_eq_uq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_eq_uq
+ // CHECK: fcmp ueq <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_EQ_UQ);
+}
+
+__mmask8 test_mm_cmp_pd_mask_nge_us(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_nge_us
+ // CHECK: fcmp ult <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_NGE_US);
+}
+
+__mmask8 test_mm_cmp_pd_mask_ngt_us(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_ngt_us
+ // CHECK: fcmp ule <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_NGT_US);
+}
+
+__mmask8 test_mm_cmp_pd_mask_false_oq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_false_oq
+ // CHECK: fcmp false <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_FALSE_OQ);
+}
+
+__mmask8 test_mm_cmp_pd_mask_neq_oq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_neq_oq
+ // CHECK: fcmp one <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_NEQ_OQ);
+}
+
+__mmask8 test_mm_cmp_pd_mask_ge_os(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_ge_os
+ // CHECK: fcmp oge <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_GE_OS);
+}
+
+__mmask8 test_mm_cmp_pd_mask_gt_os(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_gt_os
+ // CHECK: fcmp ogt <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_GT_OS);
+}
+
+__mmask8 test_mm_cmp_pd_mask_true_uq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_true_uq
+ // CHECK: fcmp true <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_TRUE_UQ);
+}
+
+__mmask8 test_mm_cmp_pd_mask_eq_os(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_eq_os
+ // CHECK: fcmp oeq <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_EQ_OS);
+}
+
+__mmask8 test_mm_cmp_pd_mask_lt_oq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_lt_oq
+ // CHECK: fcmp olt <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_LT_OQ);
+}
+
+__mmask8 test_mm_cmp_pd_mask_le_oq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_le_oq
+ // CHECK: fcmp ole <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_LE_OQ);
+}
+
+__mmask8 test_mm_cmp_pd_mask_unord_s(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_unord_s
+ // CHECK: fcmp uno <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_UNORD_S);
+}
+
+__mmask8 test_mm_cmp_pd_mask_neq_us(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_neq_us
+ // CHECK: fcmp une <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_NEQ_US);
+}
+
+__mmask8 test_mm_cmp_pd_mask_nlt_uq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_nlt_uq
+ // CHECK: fcmp uge <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_NLT_UQ);
+}
+
+__mmask8 test_mm_cmp_pd_mask_nle_uq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_nle_uq
+ // CHECK: fcmp ugt <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_NLE_UQ);
+}
+
+__mmask8 test_mm_cmp_pd_mask_ord_s(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_ord_s
+ // CHECK: fcmp ord <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_ORD_S);
+}
+
+__mmask8 test_mm_cmp_pd_mask_eq_us(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_eq_us
+ // CHECK: fcmp ueq <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_EQ_US);
+}
+
+__mmask8 test_mm_cmp_pd_mask_nge_uq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_nge_uq
+ // CHECK: fcmp ult <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_NGE_UQ);
+}
+
+__mmask8 test_mm_cmp_pd_mask_ngt_uq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_ngt_uq
+ // CHECK: fcmp ule <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_NGT_UQ);
+}
+
+__mmask8 test_mm_cmp_pd_mask_false_os(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_false_os
+ // CHECK: fcmp false <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_FALSE_OS);
+}
+
+__mmask8 test_mm_cmp_pd_mask_neq_os(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_neq_os
+ // CHECK: fcmp one <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_NEQ_OS);
+}
+
+__mmask8 test_mm_cmp_pd_mask_ge_oq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_ge_oq
+ // CHECK: fcmp oge <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_GE_OQ);
+}
+
+__mmask8 test_mm_cmp_pd_mask_gt_oq(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_gt_oq
+ // CHECK: fcmp ogt <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_GT_OQ);
+}
+
+__mmask8 test_mm_cmp_pd_mask_true_us(__m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_cmp_pd_mask_true_us
+ // CHECK: fcmp true <2 x double> %{{.*}}, %{{.*}}
+ return _mm_cmp_pd_mask(a, b, _CMP_TRUE_US);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_eq_oq(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: @test_mm_mask_cmp_pd_mask_eq_oq
+ // CHECK: [[CMP:%.*]] = fcmp oeq <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_EQ_OQ);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_lt_os(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_lt_os
+ // CHECK: [[CMP:%.*]] = fcmp olt <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_LT_OS);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_le_os(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_le_os
+ // CHECK: [[CMP:%.*]] = fcmp ole <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_LE_OS);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_unord_q(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_unord_q
+ // CHECK: [[CMP:%.*]] = fcmp uno <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_UNORD_Q);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_neq_uq(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_neq_uq
+ // CHECK: [[CMP:%.*]] = fcmp une <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_UQ);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_nlt_us(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_nlt_us
+ // CHECK: [[CMP:%.*]] = fcmp uge <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_NLT_US);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_nle_us(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_nle_us
+ // CHECK: [[CMP:%.*]] = fcmp ugt <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_NLE_US);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_ord_q(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_ord_q
+ // CHECK: [[CMP:%.*]] = fcmp ord <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_ORD_Q);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_eq_uq(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_eq_uq
+ // CHECK: [[CMP:%.*]] = fcmp ueq <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_EQ_UQ);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_nge_us(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_nge_us
+ // CHECK: [[CMP:%.*]] = fcmp ult <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_NGE_US);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_ngt_us(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_ngt_us
+ // CHECK: [[CMP:%.*]] = fcmp ule <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_NGT_US);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_false_oq(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_false_oq
+ // CHECK: [[CMP:%.*]] = fcmp false <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_FALSE_OQ);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_neq_oq(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_neq_oq
+ // CHECK: [[CMP:%.*]] = fcmp one <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_OQ);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_ge_os(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_ge_os
+ // CHECK: [[CMP:%.*]] = fcmp oge <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_GE_OS);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_gt_os(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_gt_os
+ // CHECK: [[CMP:%.*]] = fcmp ogt <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_GT_OS);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_true_uq(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_true_uq
+ // CHECK: [[CMP:%.*]] = fcmp true <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_TRUE_UQ);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_eq_os(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_eq_os
+ // CHECK: [[CMP:%.*]] = fcmp oeq <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_EQ_OS);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_lt_oq(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_lt_oq
+ // CHECK: [[CMP:%.*]] = fcmp olt <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_LT_OQ);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_le_oq(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_le_oq
+ // CHECK: [[CMP:%.*]] = fcmp ole <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_LE_OQ);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_unord_s(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_unord_s
+ // CHECK: [[CMP:%.*]] = fcmp uno <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_UNORD_S);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_neq_us(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_neq_us
+ // CHECK: [[CMP:%.*]] = fcmp une <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_US);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_nlt_uq(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_nlt_uq
+ // CHECK: [[CMP:%.*]] = fcmp uge <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_NLT_UQ);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_nle_uq(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_nle_uq
+ // CHECK: [[CMP:%.*]] = fcmp ugt <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_NLE_UQ);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_ord_s(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_ord_s
+ // CHECK: [[CMP:%.*]] = fcmp ord <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_ORD_S);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_eq_us(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_eq_us
+ // CHECK: [[CMP:%.*]] = fcmp ueq <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_EQ_US);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_nge_uq(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_nge_uq
+ // CHECK: [[CMP:%.*]] = fcmp ult <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_NGE_UQ);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_ngt_uq(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_ngt_uq
+ // CHECK: [[CMP:%.*]] = fcmp ule <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_NGT_UQ);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_false_os(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_false_os
+ // CHECK: [[CMP:%.*]] = fcmp false <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_FALSE_OS);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_neq_os(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_neq_os
+ // CHECK: [[CMP:%.*]] = fcmp one <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_OS);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_ge_oq(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_ge_oq
+ // CHECK: [[CMP:%.*]] = fcmp oge <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_GE_OQ);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_gt_oq(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_gt_oq
+ // CHECK: [[CMP:%.*]] = fcmp ogt <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_GT_OQ);
+}
+
+__mmask8 test_mm_mask_cmp_pd_mask_true_us(__mmask8 m, __m128d a, __m128d b) {
+ // CHECK-LABEL: test_mm_mask_cmp_pd_mask_true_us
+ // CHECK: [[CMP:%.*]] = fcmp true <2 x double> %{{.*}}, %{{.*}}
+ // CHECK: and <2 x i1> [[CMP]], {{.*}}
+ return _mm_mask_cmp_pd_mask(m, a, b, _CMP_TRUE_US);
}
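+// The FMA tests below verify that the masked 128/256-bit fmadd/fmsub/fnmadd/
+// fnmsub and fmaddsub/fmsubadd builtins now expand to the generic @llvm.fma.*
+// intrinsic, followed by an i8-to-<8 x i1> mask bitcast (plus a shufflevector
+// to trim the mask for the narrower vectors) feeding a select, instead of the
+// old target-specific @llvm.x86.avx512.*vfmadd*/vfmaddsub* intrinsics.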
__m128d test_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
// CHECK-LABEL: @test_mm_mask_fmadd_pd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.128
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask_fmadd_pd(__A, __U, __B, __C);
}
__m128d test_mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
// CHECK-LABEL: @test_mm_mask_fmsub_pd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.128
+ // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask_fmsub_pd(__A, __U, __B, __C);
}
__m128d test_mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm_mask3_fmadd_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.128
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask3_fmadd_pd(__A, __B, __C, __U);
}
__m128d test_mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm_mask3_fnmadd_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.128
+ // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask3_fnmadd_pd(__A, __B, __C, __U);
}
__m128d test_mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
// CHECK-LABEL: @test_mm_maskz_fmadd_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.128
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_maskz_fmadd_pd(__U, __A, __B, __C);
}
__m128d test_mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
// CHECK-LABEL: @test_mm_maskz_fmsub_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.128
+ // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_maskz_fmsub_pd(__U, __A, __B, __C);
}
__m128d test_mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
// CHECK-LABEL: @test_mm_maskz_fnmadd_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.128
+ // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_maskz_fnmadd_pd(__U, __A, __B, __C);
}
__m128d test_mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
// CHECK-LABEL: @test_mm_maskz_fnmsub_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.128
+ // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_maskz_fnmsub_pd(__U, __A, __B, __C);
}
__m256d test_mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
// CHECK-LABEL: @test_mm256_mask_fmadd_pd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.256
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask_fmadd_pd(__A, __U, __B, __C);
}
__m256d test_mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
// CHECK-LABEL: @test_mm256_mask_fmsub_pd
- // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.256
+ // CHECK: fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask_fmsub_pd(__A, __U, __B, __C);
}
__m256d test_mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm256_mask3_fmadd_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.256
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask3_fmadd_pd(__A, __B, __C, __U);
}
__m256d test_mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm256_mask3_fnmadd_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.pd.256
+ // CHECK: fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask3_fnmadd_pd(__A, __B, __C, __U);
}
__m256d test_mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
// CHECK-LABEL: @test_mm256_maskz_fmadd_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.256
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_maskz_fmadd_pd(__U, __A, __B, __C);
}
__m256d test_mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
// CHECK-LABEL: @test_mm256_maskz_fmsub_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.256
+ // CHECK: fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_maskz_fmsub_pd(__U, __A, __B, __C);
}
__m256d test_mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
// CHECK-LABEL: @test_mm256_maskz_fnmadd_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.256
+ // CHECK: fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_maskz_fnmadd_pd(__U, __A, __B, __C);
}
__m256d test_mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
// CHECK-LABEL: @test_mm256_maskz_fnmsub_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.pd.256
+ // CHECK: fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_maskz_fnmsub_pd(__U, __A, __B, __C);
}
__m128 test_mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
// CHECK-LABEL: @test_mm_mask_fmadd_ps
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.128
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_fmadd_ps(__A, __U, __B, __C);
}
__m128 test_mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
// CHECK-LABEL: @test_mm_mask_fmsub_ps
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.128
+ // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_fmsub_ps(__A, __U, __B, __C);
}
__m128 test_mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm_mask3_fmadd_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.128
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask3_fmadd_ps(__A, __B, __C, __U);
}
__m128 test_mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm_mask3_fnmadd_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.128
+ // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask3_fnmadd_ps(__A, __B, __C, __U);
}
__m128 test_mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
// CHECK-LABEL: @test_mm_maskz_fmadd_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.128
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_maskz_fmadd_ps(__U, __A, __B, __C);
}
__m128 test_mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
// CHECK-LABEL: @test_mm_maskz_fmsub_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.128
+ // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_maskz_fmsub_ps(__U, __A, __B, __C);
}
__m128 test_mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
// CHECK-LABEL: @test_mm_maskz_fnmadd_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.128
+ // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_maskz_fnmadd_ps(__U, __A, __B, __C);
}
__m128 test_mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
// CHECK-LABEL: @test_mm_maskz_fnmsub_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.128
+ // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_maskz_fnmsub_ps(__U, __A, __B, __C);
}
__m256 test_mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
// CHECK-LABEL: @test_mm256_mask_fmadd_ps
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.256
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_fmadd_ps(__A, __U, __B, __C);
}
__m256 test_mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
// CHECK-LABEL: @test_mm256_mask_fmsub_ps
- // CHECK: @llvm.x86.avx512.mask.vfmadd.ps.256
+ // CHECK: fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_fmsub_ps(__A, __U, __B, __C);
}
__m256 test_mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm256_mask3_fmadd_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.256
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask3_fmadd_ps(__A, __B, __C, __U);
}
__m256 test_mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm256_mask3_fnmadd_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmadd.ps.256
+ // CHECK: fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask3_fnmadd_ps(__A, __B, __C, __U);
}
__m256 test_mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
// CHECK-LABEL: @test_mm256_maskz_fmadd_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.256
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_fmadd_ps(__U, __A, __B, __C);
}
__m256 test_mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
// CHECK-LABEL: @test_mm256_maskz_fmsub_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.256
+ // CHECK: fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_fmsub_ps(__U, __A, __B, __C);
}
__m256 test_mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
// CHECK-LABEL: @test_mm256_maskz_fnmadd_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.256
+ // CHECK: fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_fnmadd_ps(__U, __A, __B, __C);
}
__m256 test_mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
// CHECK-LABEL: @test_mm256_maskz_fnmsub_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmadd.ps.256
+ // CHECK: fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_fnmsub_ps(__U, __A, __B, __C);
}
__m128d test_mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
// CHECK-LABEL: @test_mm_mask_fmaddsub_pd
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.128
+ // CHECK: [[ADD:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]]
+ // CHECK: shufflevector <2 x double> [[SUB]], <2 x double> [[ADD]], <2 x i32> <i32 0, i32 3>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask_fmaddsub_pd(__A, __U, __B, __C);
}
__m128d test_mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
// CHECK-LABEL: @test_mm_mask_fmsubadd_pd
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.128
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: shufflevector <2 x double> [[ADD]], <2 x double> [[SUB]], <2 x i32> <i32 0, i32 3>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask_fmsubadd_pd(__A, __U, __B, __C);
}
__m128d test_mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm_mask3_fmaddsub_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.pd.128
+ // CHECK: [[ADD:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]]
+ // CHECK: shufflevector <2 x double> [[SUB]], <2 x double> [[ADD]], <2 x i32> <i32 0, i32 3>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask3_fmaddsub_pd(__A, __B, __C, __U);
}
__m128d test_mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
// CHECK-LABEL: @test_mm_maskz_fmaddsub_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.128
+ // CHECK: [[ADD:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]]
+ // CHECK: shufflevector <2 x double> [[SUB]], <2 x double> [[ADD]], <2 x i32> <i32 0, i32 3>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_maskz_fmaddsub_pd(__U, __A, __B, __C);
}
__m128d test_mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
// CHECK-LABEL: @test_mm_maskz_fmsubadd_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.128
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: shufflevector <2 x double> [[ADD]], <2 x double> [[SUB]], <2 x i32> <i32 0, i32 3>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_maskz_fmsubadd_pd(__U, __A, __B, __C);
}
__m256d test_mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
// CHECK-LABEL: @test_mm256_mask_fmaddsub_pd
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.256
+ // CHECK: [[ADD:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> [[NEG]]
+ // CHECK: shufflevector <4 x double> [[SUB]], <4 x double> [[ADD]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask_fmaddsub_pd(__A, __U, __B, __C);
}
__m256d test_mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
// CHECK-LABEL: @test_mm256_mask_fmsubadd_pd
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.pd.256
+ // CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: shufflevector <4 x double> [[ADD]], <4 x double> [[SUB]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask_fmsubadd_pd(__A, __U, __B, __C);
}
__m256d test_mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm256_mask3_fmaddsub_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.pd.256
+ // CHECK: [[ADD:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> [[NEG]]
+ // CHECK: shufflevector <4 x double> [[SUB]], <4 x double> [[ADD]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask3_fmaddsub_pd(__A, __B, __C, __U);
}
__m256d test_mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
// CHECK-LABEL: @test_mm256_maskz_fmaddsub_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.256
+ // CHECK: [[ADD:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> [[NEG]]
+ // CHECK: shufflevector <4 x double> [[SUB]], <4 x double> [[ADD]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_maskz_fmaddsub_pd(__U, __A, __B, __C);
}
__m256d test_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
// CHECK-LABEL: @test_mm256_maskz_fmsubadd_pd
- // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.pd.256
+ // CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: shufflevector <4 x double> [[ADD]], <4 x double> [[SUB]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_maskz_fmsubadd_pd(__U, __A, __B, __C);
}
__m128 test_mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
// CHECK-LABEL: @test_mm_mask_fmaddsub_ps
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.128
+ // CHECK: [[ADD:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]
+ // CHECK: shufflevector <4 x float> [[SUB]], <4 x float> [[ADD]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_fmaddsub_ps(__A, __U, __B, __C);
}
__m128 test_mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
// CHECK-LABEL: @test_mm_mask_fmsubadd_ps
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.128
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: shufflevector <4 x float> [[ADD]], <4 x float> [[SUB]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_fmsubadd_ps(__A, __U, __B, __C);
}
__m128 test_mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm_mask3_fmaddsub_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.ps.128
+ // CHECK: [[ADD:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]
+ // CHECK: shufflevector <4 x float> [[SUB]], <4 x float> [[ADD]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask3_fmaddsub_ps(__A, __B, __C, __U);
}
__m128 test_mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
// CHECK-LABEL: @test_mm_maskz_fmaddsub_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.128
+ // CHECK: [[ADD:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]
+ // CHECK: shufflevector <4 x float> [[SUB]], <4 x float> [[ADD]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_maskz_fmaddsub_ps(__U, __A, __B, __C);
}
__m128 test_mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
// CHECK-LABEL: @test_mm_maskz_fmsubadd_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.128
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: shufflevector <4 x float> [[ADD]], <4 x float> [[SUB]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_maskz_fmsubadd_ps(__U, __A, __B, __C);
}
__m256 test_mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
// CHECK-LABEL: @test_mm256_mask_fmaddsub_ps
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.256
+ // CHECK: [[ADD:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> [[NEG]]
+ // CHECK: shufflevector <8 x float> [[SUB]], <8 x float> [[ADD]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_fmaddsub_ps(__A, __U, __B, __C);
}
__m256 test_mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
// CHECK-LABEL: @test_mm256_mask_fmsubadd_ps
- // CHECK: @llvm.x86.avx512.mask.vfmaddsub.ps.256
+ // CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: shufflevector <8 x float> [[ADD]], <8 x float> [[SUB]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_fmsubadd_ps(__A, __U, __B, __C);
}
__m256 test_mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm256_mask3_fmaddsub_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmaddsub.ps.256
+ // CHECK: [[ADD:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> [[NEG]]
+ // CHECK: shufflevector <8 x float> [[SUB]], <8 x float> [[ADD]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask3_fmaddsub_ps(__A, __B, __C, __U);
}
__m256 test_mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
// CHECK-LABEL: @test_mm256_maskz_fmaddsub_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.256
+ // CHECK: [[ADD:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> [[NEG]]
+ // CHECK: shufflevector <8 x float> [[SUB]], <8 x float> [[ADD]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_fmaddsub_ps(__U, __A, __B, __C);
}
__m256 test_mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
// CHECK-LABEL: @test_mm256_maskz_fmsubadd_ps
- // CHECK: @llvm.x86.avx512.maskz.vfmaddsub.ps.256
+ // CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: shufflevector <8 x float> [[ADD]], <8 x float> [[SUB]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_fmsubadd_ps(__U, __A, __B, __C);
}
__m128d test_mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm_mask3_fmsub_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmsub.pd.128
+ // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask3_fmsub_pd(__A, __B, __C, __U);
}
__m256d test_mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm256_mask3_fmsub_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmsub.pd.256
+ // CHECK: fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask3_fmsub_pd(__A, __B, __C, __U);
}
__m128 test_mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm_mask3_fmsub_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmsub.ps.128
+ // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask3_fmsub_ps(__A, __B, __C, __U);
}
__m256 test_mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm256_mask3_fmsub_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmsub.ps.256
+ // CHECK: fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask3_fmsub_ps(__A, __B, __C, __U);
}
__m128d test_mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm_mask3_fmsubadd_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.pd.128
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: shufflevector <2 x double> [[ADD]], <2 x double> [[SUB]], <2 x i32> <i32 0, i32 3>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask3_fmsubadd_pd(__A, __B, __C, __U);
}
__m256d test_mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm256_mask3_fmsubadd_pd
- // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.pd.256
+ // CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: shufflevector <4 x double> [[ADD]], <4 x double> [[SUB]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask3_fmsubadd_pd(__A, __B, __C, __U);
}
__m128 test_mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm_mask3_fmsubadd_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.ps.128
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: shufflevector <4 x float> [[ADD]], <4 x float> [[SUB]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask3_fmsubadd_ps(__A, __B, __C, __U);
}
__m256 test_mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm256_mask3_fmsubadd_ps
- // CHECK: @llvm.x86.avx512.mask3.vfmsubadd.ps.256
+ // CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: shufflevector <8 x float> [[ADD]], <8 x float> [[SUB]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask3_fmsubadd_ps(__A, __B, __C, __U);
}
__m128d test_mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
// CHECK-LABEL: @test_mm_mask_fnmadd_pd
- // CHECK: @llvm.x86.avx512.mask.vfnmadd.pd.128
+ // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask_fnmadd_pd(__A, __U, __B, __C);
}
__m256d test_mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
// CHECK-LABEL: @test_mm256_mask_fnmadd_pd
- // CHECK: @llvm.x86.avx512.mask.vfnmadd.pd.256
+ // CHECK: fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask_fnmadd_pd(__A, __U, __B, __C);
}
__m128 test_mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
// CHECK-LABEL: @test_mm_mask_fnmadd_ps
- // CHECK: @llvm.x86.avx512.mask.vfnmadd.ps.128
+ // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_fnmadd_ps(__A, __U, __B, __C);
}
__m256 test_mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
// CHECK-LABEL: @test_mm256_mask_fnmadd_ps
- // CHECK: @llvm.x86.avx512.mask.vfnmadd.ps.256
+ // CHECK: fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_fnmadd_ps(__A, __U, __B, __C);
}
__m128d test_mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
// CHECK-LABEL: @test_mm_mask_fnmsub_pd
- // CHECK: @llvm.x86.avx512.mask.vfnmsub.pd.128
+ // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask_fnmsub_pd(__A, __U, __B, __C);
}
__m128d test_mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm_mask3_fnmsub_pd
- // CHECK: @llvm.x86.avx512.mask3.vfnmsub.pd.128
+ // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask3_fnmsub_pd(__A, __B, __C, __U);
}
__m256d test_mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
// CHECK-LABEL: @test_mm256_mask_fnmsub_pd
- // CHECK: @llvm.x86.avx512.mask.vfnmsub.pd.256
+ // CHECK: fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask_fnmsub_pd(__A, __U, __B, __C);
}
__m256d test_mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm256_mask3_fnmsub_pd
- // CHECK: @llvm.x86.avx512.mask3.vfnmsub.pd.256
+ // CHECK: fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask3_fnmsub_pd(__A, __B, __C, __U);
}
__m128 test_mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
// CHECK-LABEL: @test_mm_mask_fnmsub_ps
- // CHECK: @llvm.x86.avx512.mask.vfnmsub.ps.128
+ // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_fnmsub_ps(__A, __U, __B, __C);
}
__m128 test_mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm_mask3_fnmsub_ps
- // CHECK: @llvm.x86.avx512.mask3.vfnmsub.ps.128
+ // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask3_fnmsub_ps(__A, __B, __C, __U);
}
__m256 test_mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
// CHECK-LABEL: @test_mm256_mask_fnmsub_ps
- // CHECK: @llvm.x86.avx512.mask.vfnmsub.ps.256
+ // CHECK: fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_fnmsub_ps(__A, __U, __B, __C);
}
__m256 test_mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
// CHECK-LABEL: @test_mm256_mask3_fnmsub_ps
- // CHECK: @llvm.x86.avx512.mask3.vfnmsub.ps.256
+ // CHECK: fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask3_fnmsub_ps(__A, __B, __C, __U);
}
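/* [Editor's note -- illustrative sketch, not part of the patch.] The new CHECK
 * lines in this hunk encode the generic lowering of the FMA family: fmaddsub and
 * fmsubadd become two @llvm.fma calls (one with the addend negated) blended by a
 * shufflevector that takes even result lanes from its first input and odd lanes
 * from its second, fnmadd/fnmsub negate operands with an fsub from -0.0, and every
 * write-mask becomes a plain IR select. A scalar reference of the alternating
 * add/subtract pattern; the helper names and float element type are assumptions
 * of this sketch, not taken from the test file. */
#include <math.h>
#include <stddef.h>

static void fmaddsub_ref(const float *a, const float *b, const float *c,
                         float *r, size_t n) {
  for (size_t i = 0; i < n; ++i)
    r[i] = (i % 2 == 0) ? fmaf(a[i], b[i], -c[i])  /* even lane: subtract */
                        : fmaf(a[i], b[i],  c[i]); /* odd lane: add */
}

static void fmsubadd_ref(const float *a, const float *b, const float *c,
                         float *r, size_t n) {
  for (size_t i = 0; i < n; ++i)
    r[i] = (i % 2 == 0) ? fmaf(a[i], b[i],  c[i])  /* even lane: add */
                        : fmaf(a[i], b[i], -c[i]); /* odd lane: subtract */
}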
@@ -1557,7 +3515,7 @@ __m128 test_mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_add_ps(__W,__U,__A,__B);
}
-__m128 test_mm_maskz_add_ps(__mmask16 __U, __m128 __A, __m128 __B) {
+__m128 test_mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm_maskz_add_ps
// CHECK: fadd <4 x float> %{{.*}}, %{{.*}}
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
@@ -1697,42 +3655,42 @@ __m256i test_mm256_maskz_compress_epi32(__mmask8 __U, __m256i __A) {
}
void test_mm_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m128d __A) {
// CHECK-LABEL: @test_mm_mask_compressstoreu_pd
- // CHECK: @llvm.x86.avx512.mask.compress.store.pd.128
+ // CHECK: @llvm.masked.compressstore.v2f64(<2 x double> %{{.*}}, double* %{{.*}}, <2 x i1> %{{.*}})
return _mm_mask_compressstoreu_pd(__P,__U,__A);
}
void test_mm256_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m256d __A) {
// CHECK-LABEL: @test_mm256_mask_compressstoreu_pd
- // CHECK: @llvm.x86.avx512.mask.compress.store.pd.256
+ // CHECK: @llvm.masked.compressstore.v4f64(<4 x double> %{{.*}}, double* %{{.*}}, <4 x i1> %{{.*}})
return _mm256_mask_compressstoreu_pd(__P,__U,__A);
}
void test_mm_mask_compressstoreu_epi64(void *__P, __mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_mask_compressstoreu_epi64
- // CHECK: @llvm.x86.avx512.mask.compress.store.q.128
+ // CHECK: @llvm.masked.compressstore.v2i64(<2 x i64> %{{.*}}, i64* %{{.*}}, <2 x i1> %{{.*}})
return _mm_mask_compressstoreu_epi64(__P,__U,__A);
}
void test_mm256_mask_compressstoreu_epi64(void *__P, __mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_compressstoreu_epi64
- // CHECK: @llvm.x86.avx512.mask.compress.store.q.256
+ // CHECK: @llvm.masked.compressstore.v4i64(<4 x i64> %{{.*}}, i64* %{{.*}}, <4 x i1> %{{.*}})
return _mm256_mask_compressstoreu_epi64(__P,__U,__A);
}
void test_mm_mask_compressstoreu_ps(void *__P, __mmask8 __U, __m128 __A) {
// CHECK-LABEL: @test_mm_mask_compressstoreu_ps
- // CHECK: @llvm.x86.avx512.mask.compress.store.ps.128
+ // CHECK: @llvm.masked.compressstore.v4f32(<4 x float> %{{.*}}, float* %{{.*}}, <4 x i1> %{{.*}})
return _mm_mask_compressstoreu_ps(__P,__U,__A);
}
void test_mm256_mask_compressstoreu_ps(void *__P, __mmask8 __U, __m256 __A) {
// CHECK-LABEL: @test_mm256_mask_compressstoreu_ps
- // CHECK: @llvm.x86.avx512.mask.compress.store.ps.256
+ // CHECK: @llvm.masked.compressstore.v8f32(<8 x float> %{{.*}}, float* %{{.*}}, <8 x i1> %{{.*}})
return _mm256_mask_compressstoreu_ps(__P,__U,__A);
}
void test_mm_mask_compressstoreu_epi32(void *__P, __mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_mask_compressstoreu_epi32
- // CHECK: @llvm.x86.avx512.mask.compress.store.d.128
+ // CHECK: @llvm.masked.compressstore.v4i32(<4 x i32> %{{.*}}, i32* %{{.*}}, <4 x i1> %{{.*}})
return _mm_mask_compressstoreu_epi32(__P,__U,__A);
}
void test_mm256_mask_compressstoreu_epi32(void *__P, __mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_compressstoreu_epi32
- // CHECK: @llvm.x86.avx512.mask.compress.store.d.256
+ // CHECK: @llvm.masked.compressstore.v8i32(<8 x i32> %{{.*}}, i32* %{{.*}}, <8 x i1> %{{.*}})
return _mm256_mask_compressstoreu_epi32(__P,__U,__A);
}
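/* [Editor's note -- illustrative sketch, not part of the patch.] The
 * compress-store tests above now match the target-independent
 * @llvm.masked.compressstore.* intrinsics: each active lane (mask bit set) is
 * written to the next unwritten slot at the destination, so the stored elements
 * end up packed and masked-off lanes consume no memory. A scalar reference with
 * assumed names: */
#include <stddef.h>
#include <stdint.h>

static size_t compressstore_ref(double *dst, const double *src,
                                const uint8_t *mask, size_t n) {
  size_t out = 0;
  for (size_t i = 0; i < n; ++i)
    if (mask[i])            /* active lane: append to the packed output */
      dst[out++] = src[i];
  return out;               /* number of elements actually written */
}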
__m128d test_mm_mask_cvtepi32_pd(__m128d __W, __mmask8 __U, __m128i __A) {
@@ -1763,22 +3721,26 @@ __m256d test_mm256_maskz_cvtepi32_pd(__mmask8 __U, __m128i __A) {
}
__m128 test_mm_mask_cvtepi32_ps(__m128 __W, __mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_mask_cvtepi32_ps
- // CHECK: @llvm.x86.avx512.mask.cvtdq2ps.128
+ // CHECK: sitofp <4 x i32> %{{.*}} to <4 x float>
+ // CHECK: select <4 x i1> {{.*}}, <4 x float> {{.*}}, <4 x float> {{.*}}
return _mm_mask_cvtepi32_ps(__W,__U,__A);
}
-__m128 test_mm_maskz_cvtepi32_ps(__mmask16 __U, __m128i __A) {
+__m128 test_mm_maskz_cvtepi32_ps(__mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_maskz_cvtepi32_ps
- // CHECK: @llvm.x86.avx512.mask.cvtdq2ps.128
+ // CHECK: sitofp <4 x i32> %{{.*}} to <4 x float>
+ // CHECK: select <4 x i1> {{.*}}, <4 x float> {{.*}}, <4 x float> {{.*}}
return _mm_maskz_cvtepi32_ps(__U,__A);
}
__m256 test_mm256_mask_cvtepi32_ps(__m256 __W, __mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_cvtepi32_ps
- // CHECK: @llvm.x86.avx512.mask.cvtdq2ps.256
+ // CHECK: sitofp <8 x i32> %{{.*}} to <8 x float>
+ // CHECK: select <8 x i1> {{.*}}, <8 x float> {{.*}}, <8 x float> {{.*}}
return _mm256_mask_cvtepi32_ps(__W,__U,__A);
}
-__m256 test_mm256_maskz_cvtepi32_ps(__mmask16 __U, __m256i __A) {
+__m256 test_mm256_maskz_cvtepi32_ps(__mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_maskz_cvtepi32_ps
- // CHECK: @llvm.x86.avx512.mask.cvtdq2ps.256
+ // CHECK: sitofp <8 x i32> %{{.*}} to <8 x float>
+ // CHECK: select <8 x i1> {{.*}}, <8 x float> {{.*}}, <8 x float> {{.*}}
return _mm256_maskz_cvtepi32_ps(__U,__A);
}
__m128i test_mm_mask_cvtpd_epi32(__m128i __W, __mmask8 __U, __m128d __A) {
@@ -1793,12 +3755,14 @@ __m128i test_mm_maskz_cvtpd_epi32(__mmask8 __U, __m128d __A) {
}
__m128i test_mm256_mask_cvtpd_epi32(__m128i __W, __mmask8 __U, __m256d __A) {
// CHECK-LABEL: @test_mm256_mask_cvtpd_epi32
- // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.256
+ // CHECK: @llvm.x86.avx.cvt.pd2dq.256
+ // CHECK: select <4 x i1> {{.*}}, <4 x i32> {{.*}}, <4 x i32> {{.*}}
return _mm256_mask_cvtpd_epi32(__W,__U,__A);
}
__m128i test_mm256_maskz_cvtpd_epi32(__mmask8 __U, __m256d __A) {
// CHECK-LABEL: @test_mm256_maskz_cvtpd_epi32
- // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.256
+ // CHECK: @llvm.x86.avx.cvt.pd2dq.256
+ // CHECK: select <4 x i1> {{.*}}, <4 x i32> {{.*}}, <4 x i32> {{.*}}
return _mm256_maskz_cvtpd_epi32(__U,__A);
}
__m128 test_mm_mask_cvtpd_ps(__m128 __W, __mmask8 __U, __m128d __A) {
@@ -1813,12 +3777,14 @@ __m128 test_mm_maskz_cvtpd_ps(__mmask8 __U, __m128d __A) {
}
__m128 test_mm256_mask_cvtpd_ps(__m128 __W, __mmask8 __U, __m256d __A) {
// CHECK-LABEL: @test_mm256_mask_cvtpd_ps
- // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.256
+ // CHECK: @llvm.x86.avx.cvt.pd2.ps.256
+ // CHECK: select <4 x i1> {{.*}}, <4 x float> {{.*}}, <4 x float> {{.*}}
return _mm256_mask_cvtpd_ps(__W,__U,__A);
}
__m128 test_mm256_maskz_cvtpd_ps(__mmask8 __U, __m256d __A) {
// CHECK-LABEL: @test_mm256_maskz_cvtpd_ps
- // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.256
+ // CHECK: @llvm.x86.avx.cvt.pd2.ps.256
+ // CHECK: select <4 x i1> {{.*}}, <4 x float> {{.*}}, <4 x float> {{.*}}
return _mm256_maskz_cvtpd_ps(__U,__A);
}
__m128i test_mm_cvtpd_epu32(__m128d __A) {
@@ -1853,42 +3819,50 @@ __m128i test_mm256_maskz_cvtpd_epu32(__mmask8 __U, __m256d __A) {
}
__m128i test_mm_mask_cvtps_epi32(__m128i __W, __mmask8 __U, __m128 __A) {
// CHECK-LABEL: @test_mm_mask_cvtps_epi32
- // CHECK: @llvm.x86.avx512.mask.cvtps2dq.128
+ // CHECK: @llvm.x86.sse2.cvtps2dq
+ // CHECK: select <4 x i1> {{.*}}, <4 x i32> {{.*}}, <4 x i32> {{.*}}
return _mm_mask_cvtps_epi32(__W,__U,__A);
}
__m128i test_mm_maskz_cvtps_epi32(__mmask8 __U, __m128 __A) {
// CHECK-LABEL: @test_mm_maskz_cvtps_epi32
- // CHECK: @llvm.x86.avx512.mask.cvtps2dq.128
+ // CHECK: @llvm.x86.sse2.cvtps2dq
+ // CHECK: select <4 x i1> {{.*}}, <4 x i32> {{.*}}, <4 x i32> {{.*}}
return _mm_maskz_cvtps_epi32(__U,__A);
}
__m256i test_mm256_mask_cvtps_epi32(__m256i __W, __mmask8 __U, __m256 __A) {
// CHECK-LABEL: @test_mm256_mask_cvtps_epi32
- // CHECK: @llvm.x86.avx512.mask.cvtps2dq.256
+ // CHECK: @llvm.x86.avx.cvt.ps2dq.256
+ // CHECK: select <8 x i1> {{.*}}, <8 x i32> {{.*}}, <8 x i32> {{.*}}
return _mm256_mask_cvtps_epi32(__W,__U,__A);
}
__m256i test_mm256_maskz_cvtps_epi32(__mmask8 __U, __m256 __A) {
// CHECK-LABEL: @test_mm256_maskz_cvtps_epi32
- // CHECK: @llvm.x86.avx512.mask.cvtps2dq.256
+ // CHECK: @llvm.x86.avx.cvt.ps2dq.256
+ // CHECK: select <8 x i1> {{.*}}, <8 x i32> {{.*}}, <8 x i32> {{.*}}
return _mm256_maskz_cvtps_epi32(__U,__A);
}
__m128d test_mm_mask_cvtps_pd(__m128d __W, __mmask8 __U, __m128 __A) {
// CHECK-LABEL: @test_mm_mask_cvtps_pd
- // CHECK: @llvm.x86.avx512.mask.cvtps2pd.128
+ // CHECK: fpext <2 x float> %{{.*}} to <2 x double>
+ // CHECK: select <2 x i1> {{.*}}, <2 x double> {{.*}}, <2 x double> {{.*}}
return _mm_mask_cvtps_pd(__W,__U,__A);
}
__m128d test_mm_maskz_cvtps_pd(__mmask8 __U, __m128 __A) {
// CHECK-LABEL: @test_mm_maskz_cvtps_pd
- // CHECK: @llvm.x86.avx512.mask.cvtps2pd.128
+ // CHECK: fpext <2 x float> %{{.*}} to <2 x double>
+ // CHECK: select <2 x i1> {{.*}}, <2 x double> {{.*}}, <2 x double> {{.*}}
return _mm_maskz_cvtps_pd(__U,__A);
}
__m256d test_mm256_mask_cvtps_pd(__m256d __W, __mmask8 __U, __m128 __A) {
// CHECK-LABEL: @test_mm256_mask_cvtps_pd
- // CHECK: @llvm.x86.avx512.mask.cvtps2pd.256
+ // CHECK: fpext <4 x float> %{{.*}} to <4 x double>
+ // CHECK: select <4 x i1> {{.*}}, <4 x double> {{.*}}, <4 x double> {{.*}}
return _mm256_mask_cvtps_pd(__W,__U,__A);
}
__m256d test_mm256_maskz_cvtps_pd(__mmask8 __U, __m128 __A) {
// CHECK-LABEL: @test_mm256_maskz_cvtps_pd
- // CHECK: @llvm.x86.avx512.mask.cvtps2pd.256
+ // CHECK: fpext <4 x float> %{{.*}} to <4 x double>
+ // CHECK: select <4 x i1> {{.*}}, <4 x double> {{.*}}, <4 x double> {{.*}}
return _mm256_maskz_cvtps_pd(__U,__A);
}
__m128i test_mm_cvtps_epu32(__m128 __A) {
@@ -1933,12 +3907,14 @@ __m128i test_mm_maskz_cvttpd_epi32(__mmask8 __U, __m128d __A) {
}
__m128i test_mm256_mask_cvttpd_epi32(__m128i __W, __mmask8 __U, __m256d __A) {
// CHECK-LABEL: @test_mm256_mask_cvttpd_epi32
- // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.256
+ // CHECK: @llvm.x86.avx.cvtt.pd2dq.256
+ // CHECK: select <4 x i1> {{.*}}, <4 x i32> {{.*}}, <4 x i32> {{.*}}
return _mm256_mask_cvttpd_epi32(__W,__U,__A);
}
__m128i test_mm256_maskz_cvttpd_epi32(__mmask8 __U, __m256d __A) {
// CHECK-LABEL: @test_mm256_maskz_cvttpd_epi32
- // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.256
+ // CHECK: @llvm.x86.avx.cvtt.pd2dq.256
+ // CHECK: select <4 x i1> {{.*}}, <4 x i32> {{.*}}, <4 x i32> {{.*}}
return _mm256_maskz_cvttpd_epi32(__U,__A);
}
__m128i test_mm_cvttpd_epu32(__m128d __A) {
@@ -1973,22 +3949,26 @@ __m128i test_mm256_maskz_cvttpd_epu32(__mmask8 __U, __m256d __A) {
}
__m128i test_mm_mask_cvttps_epi32(__m128i __W, __mmask8 __U, __m128 __A) {
// CHECK-LABEL: @test_mm_mask_cvttps_epi32
- // CHECK: @llvm.x86.avx512.mask.cvttps2dq.128
+ // CHECK: @llvm.x86.sse2.cvttps2dq
+ // CHECK: select <4 x i1> {{.*}}, <4 x i32> {{.*}}, <4 x i32> {{.*}}
return _mm_mask_cvttps_epi32(__W,__U,__A);
}
__m128i test_mm_maskz_cvttps_epi32(__mmask8 __U, __m128 __A) {
// CHECK-LABEL: @test_mm_maskz_cvttps_epi32
- // CHECK: @llvm.x86.avx512.mask.cvttps2dq.128
+ // CHECK: @llvm.x86.sse2.cvttps2dq
+ // CHECK: select <4 x i1> {{.*}}, <4 x i32> {{.*}}, <4 x i32> {{.*}}
return _mm_maskz_cvttps_epi32(__U,__A);
}
__m256i test_mm256_mask_cvttps_epi32(__m256i __W, __mmask8 __U, __m256 __A) {
// CHECK-LABEL: @test_mm256_mask_cvttps_epi32
- // CHECK: @llvm.x86.avx512.mask.cvttps2dq.256
+ // CHECK: @llvm.x86.avx.cvtt.ps2dq.256
+ // CHECK: select <8 x i1> {{.*}}, <8 x i32> {{.*}}, <8 x i32> {{.*}}
return _mm256_mask_cvttps_epi32(__W,__U,__A);
}
__m256i test_mm256_maskz_cvttps_epi32(__mmask8 __U, __m256 __A) {
// CHECK-LABEL: @test_mm256_maskz_cvttps_epi32
- // CHECK: @llvm.x86.avx512.mask.cvttps2dq.256
+ // CHECK: @llvm.x86.avx.cvtt.ps2dq.256
+ // CHECK: select <8 x i1> {{.*}}, <8 x i32> {{.*}}, <8 x i32> {{.*}}
return _mm256_maskz_cvttps_epi32(__U,__A);
}
__m128i test_mm_cvttps_epu32(__m128 __A) {
@@ -2060,32 +4040,36 @@ __m256d test_mm256_maskz_cvtepu32_pd(__mmask8 __U, __m128i __A) {
}
__m128 test_mm_cvtepu32_ps(__m128i __A) {
// CHECK-LABEL: @test_mm_cvtepu32_ps
- // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.128
+ // CHECK: uitofp <4 x i32> %{{.*}} to <4 x float>
return _mm_cvtepu32_ps(__A);
}
__m128 test_mm_mask_cvtepu32_ps(__m128 __W, __mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_mask_cvtepu32_ps
- // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.128
+ // CHECK: uitofp <4 x i32> %{{.*}} to <4 x float>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_cvtepu32_ps(__W,__U,__A);
}
__m128 test_mm_maskz_cvtepu32_ps(__mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_maskz_cvtepu32_ps
- // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.128
+ // CHECK: uitofp <4 x i32> %{{.*}} to <4 x float>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_maskz_cvtepu32_ps(__U,__A);
}
__m256 test_mm256_cvtepu32_ps(__m256i __A) {
// CHECK-LABEL: @test_mm256_cvtepu32_ps
- // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.256
+ // CHECK: uitofp <8 x i32> %{{.*}} to <8 x float>
return _mm256_cvtepu32_ps(__A);
}
__m256 test_mm256_mask_cvtepu32_ps(__m256 __W, __mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_cvtepu32_ps
- // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.256
+ // CHECK: uitofp <8 x i32> %{{.*}} to <8 x float>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_cvtepu32_ps(__W,__U,__A);
}
__m256 test_mm256_maskz_cvtepu32_ps(__mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_maskz_cvtepu32_ps
- // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.256
+ // CHECK: uitofp <8 x i32> %{{.*}} to <8 x float>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_cvtepu32_ps(__U,__A);
}
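/* [Editor's note -- illustrative sketch, not part of the patch.] The conversion
 * tests in this region now check a plain sitofp/uitofp/fpext (or the unmasked
 * SSE/AVX conversion intrinsic) followed by a select on the write-mask, instead
 * of the old @llvm.x86.avx512.mask.* forms. A scalar reference of that
 * "convert, then select" pattern for the epu32-to-ps case; names are assumed: */
#include <stddef.h>
#include <stdint.h>

static void mask_cvtepu32_ps_ref(float *r, const float *passthru,
                                 const uint32_t *a, uint8_t mask, size_t n) {
  for (size_t i = 0; i < n; ++i)
    r[i] = ((mask >> i) & 1) ? (float)a[i]    /* active lane: converted value */
                             : passthru[i];   /* inactive: __W (or 0 for maskz) */
}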
__m128d test_mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
@@ -2118,7 +4102,7 @@ __m128 test_mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_div_ps(__W,__U,__A,__B);
}
-__m128 test_mm_maskz_div_ps(__mmask16 __U, __m128 __A, __m128 __B) {
+__m128 test_mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) {
// CHECK-LABEL: @test_mm_maskz_div_ps
// CHECK: fdiv <4 x float> %{{.*}}, %{{.*}}
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
@@ -2178,82 +4162,82 @@ __m256i test_mm256_maskz_expand_epi64(__mmask8 __U, __m256i __A) {
}
__m128d test_mm_mask_expandloadu_pd(__m128d __W, __mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm_mask_expandloadu_pd
- // CHECK: @llvm.x86.avx512.mask.expand.load.pd.128
+ // CHECK: @llvm.masked.expandload.v2f64(double* %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
return _mm_mask_expandloadu_pd(__W,__U,__P);
}
__m128d test_mm_maskz_expandloadu_pd(__mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm_maskz_expandloadu_pd
- // CHECK: @llvm.x86.avx512.mask.expand.load.pd.128
+ // CHECK: @llvm.masked.expandload.v2f64(double* %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
return _mm_maskz_expandloadu_pd(__U,__P);
}
__m256d test_mm256_mask_expandloadu_pd(__m256d __W, __mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm256_mask_expandloadu_pd
- // CHECK: @llvm.x86.avx512.mask.expand.load.pd.256
+ // CHECK: @llvm.masked.expandload.v4f64(double* %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
return _mm256_mask_expandloadu_pd(__W,__U,__P);
}
__m256d test_mm256_maskz_expandloadu_pd(__mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm256_maskz_expandloadu_pd
- // CHECK: @llvm.x86.avx512.mask.expand.load.pd.256
+ // CHECK: @llvm.masked.expandload.v4f64(double* %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
return _mm256_maskz_expandloadu_pd(__U,__P);
}
__m128i test_mm_mask_expandloadu_epi64(__m128i __W, __mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm_mask_expandloadu_epi64
- // CHECK: @llvm.x86.avx512.mask.expand.load.q.128
+ // CHECK: @llvm.masked.expandload.v2i64(i64* %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
return _mm_mask_expandloadu_epi64(__W,__U,__P);
}
__m128i test_mm_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm_maskz_expandloadu_epi64
- // CHECK: @llvm.x86.avx512.mask.expand.load.q.128
+ // CHECK: @llvm.masked.expandload.v2i64(i64* %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
return _mm_maskz_expandloadu_epi64(__U,__P);
}
__m256i test_mm256_mask_expandloadu_epi64(__m256i __W, __mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm256_mask_expandloadu_epi64
- // CHECK: @llvm.x86.avx512.mask.expand.load.q.256
+ // CHECK: @llvm.masked.expandload.v4i64(i64* %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
return _mm256_mask_expandloadu_epi64(__W,__U,__P);
}
__m256i test_mm256_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm256_maskz_expandloadu_epi64
- // CHECK: @llvm.x86.avx512.mask.expand.load.q.256
+ // CHECK: @llvm.masked.expandload.v4i64(i64* %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
return _mm256_maskz_expandloadu_epi64(__U,__P);
}
__m128 test_mm_mask_expandloadu_ps(__m128 __W, __mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm_mask_expandloadu_ps
- // CHECK: @llvm.x86.avx512.mask.expand.load.ps.128
+ // CHECK: @llvm.masked.expandload.v4f32(float* %{{.*}}, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
return _mm_mask_expandloadu_ps(__W,__U,__P);
}
__m128 test_mm_maskz_expandloadu_ps(__mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm_maskz_expandloadu_ps
- // CHECK: @llvm.x86.avx512.mask.expand.load.ps.128
+ // CHECK: @llvm.masked.expandload.v4f32(float* %{{.*}}, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
return _mm_maskz_expandloadu_ps(__U,__P);
}
__m256 test_mm256_mask_expandloadu_ps(__m256 __W, __mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm256_mask_expandloadu_ps
- // CHECK: @llvm.x86.avx512.mask.expand.load.ps.256
+ // CHECK: @llvm.masked.expandload.v8f32(float* %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
return _mm256_mask_expandloadu_ps(__W,__U,__P);
}
__m256 test_mm256_maskz_expandloadu_ps(__mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm256_maskz_expandloadu_ps
- // CHECK: @llvm.x86.avx512.mask.expand.load.ps.256
+ // CHECK: @llvm.masked.expandload.v8f32(float* %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
return _mm256_maskz_expandloadu_ps(__U,__P);
}
__m128i test_mm_mask_expandloadu_epi32(__m128i __W, __mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm_mask_expandloadu_epi32
- // CHECK: @llvm.x86.avx512.mask.expand.load.d.128
+ // CHECK: @llvm.masked.expandload.v4i32(i32* %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
return _mm_mask_expandloadu_epi32(__W,__U,__P);
}
__m128i test_mm_maskz_expandloadu_epi32(__mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm_maskz_expandloadu_epi32
- // CHECK: @llvm.x86.avx512.mask.expand.load.d.128
+ // CHECK: @llvm.masked.expandload.v4i32(i32* %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
return _mm_maskz_expandloadu_epi32(__U,__P);
}
__m256i test_mm256_mask_expandloadu_epi32(__m256i __W, __mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm256_mask_expandloadu_epi32
- // CHECK: @llvm.x86.avx512.mask.expand.load.d.256
+ // CHECK: @llvm.masked.expandload.v8i32(i32* %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_mask_expandloadu_epi32(__W,__U,__P);
}
__m256i test_mm256_maskz_expandloadu_epi32(__mmask8 __U, void const *__P) {
// CHECK-LABEL: @test_mm256_maskz_expandloadu_epi32
- // CHECK: @llvm.x86.avx512.mask.expand.load.d.256
+ // CHECK: @llvm.masked.expandload.v8i32(i32* %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_maskz_expandloadu_epi32(__U,__P);
}
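/* [Editor's note -- illustrative sketch, not part of the patch.] The expand-load
 * tests above now match @llvm.masked.expandload.*, the inverse of compress-store:
 * consecutive values are read from memory and placed, in order, into the lanes
 * whose mask bit is set, while inactive lanes keep the pass-through operand
 * (__W for the _mask_ forms, zero for _maskz_). A scalar reference with assumed
 * names: */
#include <stddef.h>
#include <stdint.h>

static void expandload_ref(double *dst, const double *src, const double *passthru,
                           const uint8_t *mask, size_t n) {
  size_t in = 0;
  for (size_t i = 0; i < n; ++i)
    dst[i] = mask[i] ? src[in++]     /* active lane: next packed element */
                     : passthru[i];  /* inactive lane: pass-through value */
}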
__m128 test_mm_mask_expand_ps(__m128 __W, __mmask8 __U, __m128 __A) {
@@ -3134,49 +5118,49 @@ void test_mm256_mask_i32scatter_epi32(int *__addr, __mmask8 __mask, __m256i __i
}
__m128d test_mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) {
// CHECK-LABEL: @test_mm_mask_sqrt_pd
- // CHECK: @llvm.x86.sse2.sqrt.pd
+ // CHECK: @llvm.sqrt.v2f64
// CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask_sqrt_pd(__W,__U,__A);
}
__m128d test_mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) {
// CHECK-LABEL: @test_mm_maskz_sqrt_pd
- // CHECK: @llvm.x86.sse2.sqrt.pd
+ // CHECK: @llvm.sqrt.v2f64
// CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_maskz_sqrt_pd(__U,__A);
}
__m256d test_mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) {
// CHECK-LABEL: @test_mm256_mask_sqrt_pd
- // CHECK: @llvm.x86.avx.sqrt.pd.256
+ // CHECK: @llvm.sqrt.v4f64
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask_sqrt_pd(__W,__U,__A);
}
__m256d test_mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) {
// CHECK-LABEL: @test_mm256_maskz_sqrt_pd
- // CHECK: @llvm.x86.avx.sqrt.pd.256
+ // CHECK: @llvm.sqrt.v4f64
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_maskz_sqrt_pd(__U,__A);
}
__m128 test_mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) {
// CHECK-LABEL: @test_mm_mask_sqrt_ps
- // CHECK: @llvm.x86.sse.sqrt.ps
+ // CHECK: @llvm.sqrt.v4f32
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_sqrt_ps(__W,__U,__A);
}
__m128 test_mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) {
// CHECK-LABEL: @test_mm_maskz_sqrt_ps
- // CHECK: @llvm.x86.sse.sqrt.ps
+ // CHECK: @llvm.sqrt.v4f32
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_maskz_sqrt_ps(__U,__A);
}
__m256 test_mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) {
// CHECK-LABEL: @test_mm256_mask_sqrt_ps
- // CHECK: @llvm.x86.avx.sqrt.ps.256
+ // CHECK: @llvm.sqrt.v8f32
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_sqrt_ps(__W,__U,__A);
}
__m256 test_mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) {
// CHECK-LABEL: @test_mm256_maskz_sqrt_ps
- // CHECK: @llvm.x86.avx.sqrt.ps.256
+ // CHECK: @llvm.sqrt.v8f32
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_sqrt_ps(__U,__A);
}
@@ -3230,162 +5214,186 @@ __m256 test_mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) {
}
__m128i test_mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) {
// CHECK-LABEL: @test_mm_mask2_permutex2var_epi32
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.d.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_mask2_permutex2var_epi32(__A,__I,__U,__B);
}
__m256i test_mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask2_permutex2var_epi32
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.d.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask2_permutex2var_epi32(__A,__I,__U,__B);
}
__m128d test_mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) {
// CHECK-LABEL: @test_mm_mask2_permutex2var_pd
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.pd.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.pd.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask2_permutex2var_pd(__A,__I,__U,__B);
}
__m256d test_mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U, __m256d __B) {
// CHECK-LABEL: @test_mm256_mask2_permutex2var_pd
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.pd.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.pd.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask2_permutex2var_pd(__A,__I,__U,__B);
}
__m128 test_mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) {
// CHECK-LABEL: @test_mm_mask2_permutex2var_ps
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.ps.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.ps.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask2_permutex2var_ps(__A,__I,__U,__B);
}
__m256 test_mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U, __m256 __B) {
// CHECK-LABEL: @test_mm256_mask2_permutex2var_ps
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.ps.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.ps.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask2_permutex2var_ps(__A,__I,__U,__B);
}
__m128i test_mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) {
// CHECK-LABEL: @test_mm_mask2_permutex2var_epi64
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.q.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_mask2_permutex2var_epi64(__A,__I,__U,__B);
}
__m256i test_mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask2_permutex2var_epi64
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.q.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_mask2_permutex2var_epi64(__A,__I,__U,__B);
}
__m128i test_mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) {
// CHECK-LABEL: @test_mm_permutex2var_epi32
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.d.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.d.128
return _mm_permutex2var_epi32(__A,__I,__B);
}
__m128i test_mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_permutex2var_epi32
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.d.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_mask_permutex2var_epi32(__A,__U,__I,__B);
}
__m128i test_mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) {
// CHECK-LABEL: @test_mm_maskz_permutex2var_epi32
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.d.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_maskz_permutex2var_epi32(__U,__A,__I,__B);
}
__m256i test_mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) {
// CHECK-LABEL: @test_mm256_permutex2var_epi32
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.d.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.d.256
return _mm256_permutex2var_epi32(__A,__I,__B);
}
__m256i test_mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_permutex2var_epi32
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.d.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_permutex2var_epi32(__A,__U,__I,__B);
}
__m256i test_mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_permutex2var_epi32
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.d.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_permutex2var_epi32(__U,__A,__I,__B);
}
__m128d test_mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) {
// CHECK-LABEL: @test_mm_permutex2var_pd
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.pd.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.pd.128
return _mm_permutex2var_pd(__A,__I,__B);
}
__m128d test_mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) {
// CHECK-LABEL: @test_mm_mask_permutex2var_pd
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.pd.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.pd.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask_permutex2var_pd(__A,__U,__I,__B);
}
__m128d test_mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) {
// CHECK-LABEL: @test_mm_maskz_permutex2var_pd
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.pd.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.pd.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_maskz_permutex2var_pd(__U,__A,__I,__B);
}
__m256d test_mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) {
// CHECK-LABEL: @test_mm256_permutex2var_pd
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.pd.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.pd.256
return _mm256_permutex2var_pd(__A,__I,__B);
}
__m256d test_mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, __m256d __B) {
// CHECK-LABEL: @test_mm256_mask_permutex2var_pd
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.pd.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.pd.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask_permutex2var_pd(__A,__U,__I,__B);
}
__m256d test_mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I, __m256d __B) {
// CHECK-LABEL: @test_mm256_maskz_permutex2var_pd
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.pd.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.pd.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_maskz_permutex2var_pd(__U,__A,__I,__B);
}
__m128 test_mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) {
// CHECK-LABEL: @test_mm_permutex2var_ps
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.ps.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.ps.128
return _mm_permutex2var_ps(__A,__I,__B);
}
__m128 test_mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) {
// CHECK-LABEL: @test_mm_mask_permutex2var_ps
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.ps.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.ps.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_permutex2var_ps(__A,__U,__I,__B);
}
__m128 test_mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) {
// CHECK-LABEL: @test_mm_maskz_permutex2var_ps
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.ps.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.ps.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_maskz_permutex2var_ps(__U,__A,__I,__B);
}
__m256 test_mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) {
// CHECK-LABEL: @test_mm256_permutex2var_ps
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.ps.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.ps.256
return _mm256_permutex2var_ps(__A,__I,__B);
}
__m256 test_mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) {
// CHECK-LABEL: @test_mm256_mask_permutex2var_ps
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.ps.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.ps.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_permutex2var_ps(__A,__U,__I,__B);
}
__m256 test_mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, __m256 __B) {
// CHECK-LABEL: @test_mm256_maskz_permutex2var_ps
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.ps.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.ps.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_permutex2var_ps(__U,__A,__I,__B);
}
__m128i test_mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) {
// CHECK-LABEL: @test_mm_permutex2var_epi64
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.q.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.q.128
return _mm_permutex2var_epi64(__A,__I,__B);
}
__m128i test_mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_permutex2var_epi64
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.q.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_mask_permutex2var_epi64(__A,__U,__I,__B);
}
__m128i test_mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) {
// CHECK-LABEL: @test_mm_maskz_permutex2var_epi64
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.q.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_maskz_permutex2var_epi64(__U,__A,__I,__B);
}
__m256i test_mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) {
// CHECK-LABEL: @test_mm256_permutex2var_epi64
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.q.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.q.256
return _mm256_permutex2var_epi64(__A,__I,__B);
}
__m256i test_mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_permutex2var_epi64
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.q.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_mask_permutex2var_epi64(__A,__U,__I,__B);
}
__m256i test_mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_permutex2var_epi64
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.q.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_maskz_permutex2var_epi64(__U,__A,__I,__B);
}
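/* [Editor's note -- illustrative sketch, not part of the patch.] Every
 * permutex2var variant above now funnels through the unmasked
 * @llvm.x86.avx512.vpermi2var.* intrinsic, with the mask/maskz behaviour
 * expressed as a trailing select. The permute itself picks, for each destination
 * lane, one element of the 2N-element concatenation {A, B} using the low
 * log2(2N) bits of the per-lane index. A scalar reference for 32-bit elements
 * (N must be a power of two; names are assumed): */
#include <stddef.h>
#include <stdint.h>

static void permutex2var_ref(const int32_t *a, const uint32_t *idx,
                             const int32_t *b, int32_t *r, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    uint32_t sel = idx[i] & (uint32_t)(2 * n - 1); /* only the low index bits matter */
    r[i] = (sel < n) ? a[sel] : b[sel - n];        /* top bit chooses A vs. B */
  }
}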
@@ -3671,290 +5679,322 @@ __m256i test_mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
__m128i test_mm_rol_epi32(__m128i __A) {
// CHECK-LABEL: @test_mm_rol_epi32
- // CHECK: @llvm.x86.avx512.mask.prol.d.128
+ // CHECK: @llvm.x86.avx512.prol.d.128
return _mm_rol_epi32(__A, 5);
}
__m128i test_mm_mask_rol_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_mask_rol_epi32
- // CHECK: @llvm.x86.avx512.mask.prol.d.128
+ // CHECK: @llvm.x86.avx512.prol.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_mask_rol_epi32(__W, __U, __A, 5);
}
__m128i test_mm_maskz_rol_epi32(__mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_maskz_rol_epi32
- // CHECK: @llvm.x86.avx512.mask.prol.d.128
+ // CHECK: @llvm.x86.avx512.prol.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_maskz_rol_epi32(__U, __A, 5);
}
__m256i test_mm256_rol_epi32(__m256i __A) {
// CHECK-LABEL: @test_mm256_rol_epi32
- // CHECK: @llvm.x86.avx512.mask.prol.d.256
+ // CHECK: @llvm.x86.avx512.prol.d.256
return _mm256_rol_epi32(__A, 5);
}
__m256i test_mm256_mask_rol_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_rol_epi32
- // CHECK: @llvm.x86.avx512.mask.prol.d.256
+ // CHECK: @llvm.x86.avx512.prol.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_rol_epi32(__W, __U, __A, 5);
}
__m256i test_mm256_maskz_rol_epi32(__mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_maskz_rol_epi32
- // CHECK: @llvm.x86.avx512.mask.prol.d.256
+ // CHECK: @llvm.x86.avx512.prol.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_rol_epi32(__U, __A, 5);
}
__m128i test_mm_rol_epi64(__m128i __A) {
// CHECK-LABEL: @test_mm_rol_epi64
- // CHECK: @llvm.x86.avx512.mask.prol.q.128
+ // CHECK: @llvm.x86.avx512.prol.q.128
return _mm_rol_epi64(__A, 5);
}
__m128i test_mm_mask_rol_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_mask_rol_epi64
- // CHECK: @llvm.x86.avx512.mask.prol.q.128
+ // CHECK: @llvm.x86.avx512.prol.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_mask_rol_epi64(__W, __U, __A, 5);
}
__m128i test_mm_maskz_rol_epi64(__mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_maskz_rol_epi64
- // CHECK: @llvm.x86.avx512.mask.prol.q.128
+ // CHECK: @llvm.x86.avx512.prol.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_maskz_rol_epi64(__U, __A, 5);
}
__m256i test_mm256_rol_epi64(__m256i __A) {
// CHECK-LABEL: @test_mm256_rol_epi64
- // CHECK: @llvm.x86.avx512.mask.prol.q.256
+ // CHECK: @llvm.x86.avx512.prol.q.256
return _mm256_rol_epi64(__A, 5);
}
__m256i test_mm256_mask_rol_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_rol_epi64
- // CHECK: @llvm.x86.avx512.mask.prol.q.256
+ // CHECK: @llvm.x86.avx512.prol.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_mask_rol_epi64(__W, __U, __A, 5);
}
__m256i test_mm256_maskz_rol_epi64(__mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_maskz_rol_epi64
- // CHECK: @llvm.x86.avx512.mask.prol.q.256
+ // CHECK: @llvm.x86.avx512.prol.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_maskz_rol_epi64(__U, __A, 5);
}
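/* [Editor's note -- illustrative sketch, not part of the patch.] The rotate tests
 * now check the unmasked @llvm.x86.avx512.prol/pror (and prolv/prorv) intrinsics,
 * with the masked forms again ending in a select. For reference, a rotate-left of
 * one 32-bit lane; the helper name is assumed, and the count is reduced to its
 * low five bits here simply so the C shifts stay well defined: */
#include <stdint.h>

static uint32_t rol32_ref(uint32_t x, unsigned count) {
  count &= 31;                                      /* keep the shift amounts in range */
  return (x << count) | (x >> ((32 - count) & 31)); /* left part | wrapped-around part */
}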
__m128i test_mm_rolv_epi32(__m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_rolv_epi32
- // CHECK: @llvm.x86.avx512.mask.prolv.d.128
+ // CHECK: @llvm.x86.avx512.prolv.d.128
return _mm_rolv_epi32(__A, __B);
}
__m128i test_mm_mask_rolv_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_rolv_epi32
- // CHECK: @llvm.x86.avx512.mask.prolv.d.128
+ // CHECK: @llvm.x86.avx512.prolv.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_mask_rolv_epi32(__W, __U, __A, __B);
}
__m128i test_mm_maskz_rolv_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_maskz_rolv_epi32
- // CHECK: @llvm.x86.avx512.mask.prolv.d.128
+ // CHECK: @llvm.x86.avx512.prolv.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_maskz_rolv_epi32(__U, __A, __B);
}
__m256i test_mm256_rolv_epi32(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_rolv_epi32
- // CHECK: @llvm.x86.avx512.mask.prolv.d.256
+ // CHECK: @llvm.x86.avx512.prolv.d.256
return _mm256_rolv_epi32(__A, __B);
}
__m256i test_mm256_mask_rolv_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_rolv_epi32
- // CHECK: @llvm.x86.avx512.mask.prolv.d.256
+ // CHECK: @llvm.x86.avx512.prolv.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_rolv_epi32(__W, __U, __A, __B);
}
__m256i test_mm256_maskz_rolv_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_rolv_epi32
- // CHECK: @llvm.x86.avx512.mask.prolv.d.256
+ // CHECK: @llvm.x86.avx512.prolv.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_rolv_epi32(__U, __A, __B);
}
__m128i test_mm_rolv_epi64(__m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_rolv_epi64
- // CHECK: @llvm.x86.avx512.mask.prolv.q.128
+ // CHECK: @llvm.x86.avx512.prolv.q.128
return _mm_rolv_epi64(__A, __B);
}
__m128i test_mm_mask_rolv_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_rolv_epi64
- // CHECK: @llvm.x86.avx512.mask.prolv.q.128
+ // CHECK: @llvm.x86.avx512.prolv.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_mask_rolv_epi64(__W, __U, __A, __B);
}
__m128i test_mm_maskz_rolv_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_maskz_rolv_epi64
- // CHECK: @llvm.x86.avx512.mask.prolv.q.128
+ // CHECK: @llvm.x86.avx512.prolv.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_maskz_rolv_epi64(__U, __A, __B);
}
__m256i test_mm256_rolv_epi64(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_rolv_epi64
- // CHECK: @llvm.x86.avx512.mask.prolv.q.256
+ // CHECK: @llvm.x86.avx512.prolv.q.256
return _mm256_rolv_epi64(__A, __B);
}
__m256i test_mm256_mask_rolv_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_rolv_epi64
- // CHECK: @llvm.x86.avx512.mask.prolv.q.256
+ // CHECK: @llvm.x86.avx512.prolv.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_mask_rolv_epi64(__W, __U, __A, __B);
}
__m256i test_mm256_maskz_rolv_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_rolv_epi64
- // CHECK: @llvm.x86.avx512.mask.prolv.q.256
+ // CHECK: @llvm.x86.avx512.prolv.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_maskz_rolv_epi64(__U, __A, __B);
}
__m128i test_mm_ror_epi32(__m128i __A) {
// CHECK-LABEL: @test_mm_ror_epi32
- // CHECK: @llvm.x86.avx512.mask.pror.d.128
+ // CHECK: @llvm.x86.avx512.pror.d.128
return _mm_ror_epi32(__A, 5);
}
__m128i test_mm_mask_ror_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_mask_ror_epi32
- // CHECK: @llvm.x86.avx512.mask.pror.d.128
+ // CHECK: @llvm.x86.avx512.pror.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_mask_ror_epi32(__W, __U, __A, 5);
}
__m128i test_mm_maskz_ror_epi32(__mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_maskz_ror_epi32
- // CHECK: @llvm.x86.avx512.mask.pror.d.128
+ // CHECK: @llvm.x86.avx512.pror.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_maskz_ror_epi32(__U, __A, 5);
}
__m256i test_mm256_ror_epi32(__m256i __A) {
// CHECK-LABEL: @test_mm256_ror_epi32
- // CHECK: @llvm.x86.avx512.mask.pror.d.256
+ // CHECK: @llvm.x86.avx512.pror.d.256
return _mm256_ror_epi32(__A, 5);
}
__m256i test_mm256_mask_ror_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_ror_epi32
- // CHECK: @llvm.x86.avx512.mask.pror.d.256
+ // CHECK: @llvm.x86.avx512.pror.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_ror_epi32(__W, __U, __A, 5);
}
__m256i test_mm256_maskz_ror_epi32(__mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_maskz_ror_epi32
- // CHECK: @llvm.x86.avx512.mask.pror.d.256
+ // CHECK: @llvm.x86.avx512.pror.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_ror_epi32(__U, __A, 5);
}
__m128i test_mm_ror_epi64(__m128i __A) {
// CHECK-LABEL: @test_mm_ror_epi64
- // CHECK: @llvm.x86.avx512.mask.pror.q.128
+ // CHECK: @llvm.x86.avx512.pror.q.128
return _mm_ror_epi64(__A, 5);
}
__m128i test_mm_mask_ror_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_mask_ror_epi64
- // CHECK: @llvm.x86.avx512.mask.pror.q.128
+ // CHECK: @llvm.x86.avx512.pror.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_mask_ror_epi64(__W, __U, __A, 5);
}
__m128i test_mm_maskz_ror_epi64(__mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_maskz_ror_epi64
- // CHECK: @llvm.x86.avx512.mask.pror.q.128
+ // CHECK: @llvm.x86.avx512.pror.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_maskz_ror_epi64(__U, __A, 5);
}
__m256i test_mm256_ror_epi64(__m256i __A) {
// CHECK-LABEL: @test_mm256_ror_epi64
- // CHECK: @llvm.x86.avx512.mask.pror.q.256
+ // CHECK: @llvm.x86.avx512.pror.q.256
return _mm256_ror_epi64(__A, 5);
}
__m256i test_mm256_mask_ror_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_ror_epi64
- // CHECK: @llvm.x86.avx512.mask.pror.q.256
+ // CHECK: @llvm.x86.avx512.pror.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_mask_ror_epi64(__W, __U, __A,5);
}
__m256i test_mm256_maskz_ror_epi64(__mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_maskz_ror_epi64
- // CHECK: @llvm.x86.avx512.mask.pror.q.256
+ // CHECK: @llvm.x86.avx512.pror.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_maskz_ror_epi64(__U, __A, 5);
}
__m128i test_mm_rorv_epi32(__m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_rorv_epi32
- // CHECK: @llvm.x86.avx512.mask.prorv.d.128
+ // CHECK: @llvm.x86.avx512.prorv.d.128
return _mm_rorv_epi32(__A, __B);
}
__m128i test_mm_mask_rorv_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_rorv_epi32
- // CHECK: @llvm.x86.avx512.mask.prorv.d.128
+ // CHECK: @llvm.x86.avx512.prorv.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_mask_rorv_epi32(__W, __U, __A, __B);
}
__m128i test_mm_maskz_rorv_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_maskz_rorv_epi32
- // CHECK: @llvm.x86.avx512.mask.prorv.d.128
+ // CHECK: @llvm.x86.avx512.prorv.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_maskz_rorv_epi32(__U, __A, __B);
}
__m256i test_mm256_rorv_epi32(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_rorv_epi32
- // CHECK: @llvm.x86.avx512.mask.prorv.d.256
+ // CHECK: @llvm.x86.avx512.prorv.d.256
return _mm256_rorv_epi32(__A, __B);
}
__m256i test_mm256_mask_rorv_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_rorv_epi32
- // CHECK: @llvm.x86.avx512.mask.prorv.d.256
+ // CHECK: @llvm.x86.avx512.prorv.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_rorv_epi32(__W, __U, __A, __B);
}
__m256i test_mm256_maskz_rorv_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_rorv_epi32
- // CHECK: @llvm.x86.avx512.mask.prorv.d.256
+ // CHECK: @llvm.x86.avx512.prorv.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_rorv_epi32(__U, __A, __B);
}
__m128i test_mm_rorv_epi64(__m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_rorv_epi64
- // CHECK: @llvm.x86.avx512.mask.prorv.q.128
+ // CHECK: @llvm.x86.avx512.prorv.q.128
return _mm_rorv_epi64(__A, __B);
}
__m128i test_mm_mask_rorv_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_rorv_epi64
- // CHECK: @llvm.x86.avx512.mask.prorv.q.128
+ // CHECK: @llvm.x86.avx512.prorv.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_mask_rorv_epi64(__W, __U, __A, __B);
}
__m128i test_mm_maskz_rorv_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_maskz_rorv_epi64
- // CHECK: @llvm.x86.avx512.mask.prorv.q.128
+ // CHECK: @llvm.x86.avx512.prorv.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_maskz_rorv_epi64(__U, __A, __B);
}
__m256i test_mm256_rorv_epi64(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_rorv_epi64
- // CHECK: @llvm.x86.avx512.mask.prorv.q.256
+ // CHECK: @llvm.x86.avx512.prorv.q.256
return _mm256_rorv_epi64(__A, __B);
}
__m256i test_mm256_mask_rorv_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_rorv_epi64
- // CHECK: @llvm.x86.avx512.mask.prorv.q.256
+ // CHECK: @llvm.x86.avx512.prorv.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_mask_rorv_epi64(__W, __U, __A, __B);
}
__m256i test_mm256_maskz_rorv_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_rorv_epi64
- // CHECK: @llvm.x86.avx512.mask.prorv.q.256
+ // CHECK: @llvm.x86.avx512.prorv.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_maskz_rorv_epi64(__U, __A, __B);
}
@@ -4560,7 +6600,6 @@ __m256i test_mm256_maskz_set1_epi32(__mmask8 __M) {
return _mm256_maskz_set1_epi32(__M, 5);
}
-#ifdef __x86_64__
__m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) {
// CHECK-LABEL: @test_mm_mask_set1_epi64
// CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
@@ -4600,7 +6639,6 @@ __m256i test_mm256_maskz_set1_epi64(__mmask8 __M, long long __A) {
// CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_maskz_set1_epi64(__M, __A);
}
-#endif
__m128d test_mm_fixupimm_pd(__m128d __A, __m128d __B, __m128i __C) {
// CHECK-LABEL: @test_mm_fixupimm_pd
@@ -5076,56 +7114,56 @@ __m256 test_mm256_maskz_rcp14_ps(__mmask8 __U, __m256 __A) {
__m128d test_mm_mask_permute_pd(__m128d __W, __mmask8 __U, __m128d __X) {
// CHECK-LABEL: @test_mm_mask_permute_pd
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> zeroinitializer, <2 x i32> <i32 1, i32 0>
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> <i32 1, i32 0>
// CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask_permute_pd(__W, __U, __X, 1);
}
__m128d test_mm_maskz_permute_pd(__mmask8 __U, __m128d __X) {
// CHECK-LABEL: @test_mm_maskz_permute_pd
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> zeroinitializer, <2 x i32> <i32 1, i32 0>
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> <i32 1, i32 0>
// CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_maskz_permute_pd(__U, __X, 1);
}
__m256d test_mm256_mask_permute_pd(__m256d __W, __mmask8 __U, __m256d __X) {
// CHECK-LABEL: @test_mm256_mask_permute_pd
- // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask_permute_pd(__W, __U, __X, 5);
}
__m256d test_mm256_maskz_permute_pd(__mmask8 __U, __m256d __X) {
// CHECK-LABEL: @test_mm256_maskz_permute_pd
- // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_maskz_permute_pd(__U, __X, 5);
}
__m128 test_mm_mask_permute_ps(__m128 __W, __mmask8 __U, __m128 __X) {
// CHECK-LABEL: @test_mm_mask_permute_ps
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_permute_ps(__W, __U, __X, 0x1b);
}
__m128 test_mm_maskz_permute_ps(__mmask8 __U, __m128 __X) {
// CHECK-LABEL: @test_mm_maskz_permute_ps
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_maskz_permute_ps(__U, __X, 0x1b);
}
__m256 test_mm256_mask_permute_ps(__m256 __W, __mmask8 __U, __m256 __X) {
// CHECK-LABEL: @test_mm256_mask_permute_ps
- // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_permute_ps(__W, __U, __X, 0x1b);
}
__m256 test_mm256_maskz_permute_ps(__mmask8 __U, __m256 __X) {
// CHECK-LABEL: @test_mm256_maskz_permute_ps
- // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_permute_ps(__U, __X, 0x1b);
}
@@ -5556,73 +7594,81 @@ __m256i test_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A) {
__m128i test_mm_ternarylogic_epi32(__m128i __A, __m128i __B, __m128i __C) {
// CHECK-LABEL: @test_mm_ternarylogic_epi32
- // CHECK: @llvm.x86.avx512.mask.pternlog.d.128
+ // CHECK: @llvm.x86.avx512.pternlog.d.128
return _mm_ternarylogic_epi32(__A, __B, __C, 4);
}
__m128i test_mm_mask_ternarylogic_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
// CHECK-LABEL: @test_mm_mask_ternarylogic_epi32
- // CHECK: @llvm.x86.avx512.mask.pternlog.d.128
+ // CHECK: @llvm.x86.avx512.pternlog.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_mask_ternarylogic_epi32(__A, __U, __B, __C, 4);
}
__m128i test_mm_maskz_ternarylogic_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
// CHECK-LABEL: @test_mm_maskz_ternarylogic_epi32
- // CHECK: @llvm.x86.avx512.maskz.pternlog.d.128
+ // CHECK: @llvm.x86.avx512.pternlog.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> zeroinitializer
return _mm_maskz_ternarylogic_epi32(__U, __A, __B, __C, 4);
}
__m256i test_mm256_ternarylogic_epi32(__m256i __A, __m256i __B, __m256i __C) {
// CHECK-LABEL: @test_mm256_ternarylogic_epi32
- // CHECK: @llvm.x86.avx512.mask.pternlog.d.256
+ // CHECK: @llvm.x86.avx512.pternlog.d.256
return _mm256_ternarylogic_epi32(__A, __B, __C, 4);
}
__m256i test_mm256_mask_ternarylogic_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
// CHECK-LABEL: @test_mm256_mask_ternarylogic_epi32
- // CHECK: @llvm.x86.avx512.mask.pternlog.d.256
+ // CHECK: @llvm.x86.avx512.pternlog.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_ternarylogic_epi32(__A, __U, __B, __C, 4);
}
__m256i test_mm256_maskz_ternarylogic_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
// CHECK-LABEL: @test_mm256_maskz_ternarylogic_epi32
- // CHECK: @llvm.x86.avx512.maskz.pternlog.d.256
+ // CHECK: @llvm.x86.avx512.pternlog.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> zeroinitializer
return _mm256_maskz_ternarylogic_epi32(__U, __A, __B, __C, 4);
}
__m128i test_mm_ternarylogic_epi64(__m128i __A, __m128i __B, __m128i __C) {
// CHECK-LABEL: @test_mm_ternarylogic_epi64
- // CHECK: @llvm.x86.avx512.mask.pternlog.q.128
+ // CHECK: @llvm.x86.avx512.pternlog.q.128
return _mm_ternarylogic_epi64(__A, __B, __C, 4);
}
__m128i test_mm_mask_ternarylogic_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
// CHECK-LABEL: @test_mm_mask_ternarylogic_epi64
- // CHECK: @llvm.x86.avx512.mask.pternlog.q.128
+ // CHECK: @llvm.x86.avx512.pternlog.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm_mask_ternarylogic_epi64(__A, __U, __B, __C, 4);
}
__m128i test_mm_maskz_ternarylogic_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
// CHECK-LABEL: @test_mm_maskz_ternarylogic_epi64
- // CHECK: @llvm.x86.avx512.maskz.pternlog.q.128
+ // CHECK: @llvm.x86.avx512.pternlog.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> zeroinitializer
return _mm_maskz_ternarylogic_epi64(__U, __A, __B, __C, 4);
}
__m256i test_mm256_ternarylogic_epi64(__m256i __A, __m256i __B, __m256i __C) {
// CHECK-LABEL: @test_mm256_ternarylogic_epi64
- // CHECK: @llvm.x86.avx512.mask.pternlog.q.256
+ // CHECK: @llvm.x86.avx512.pternlog.q.256
return _mm256_ternarylogic_epi64(__A, __B, __C, 4);
}
__m256i test_mm256_mask_ternarylogic_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
// CHECK-LABEL: @test_mm256_mask_ternarylogic_epi64
- // CHECK: @llvm.x86.avx512.mask.pternlog.q.256
+ // CHECK: @llvm.x86.avx512.pternlog.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_mask_ternarylogic_epi64(__A, __U, __B, __C, 4);
}
__m256i test_mm256_maskz_ternarylogic_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
// CHECK-LABEL: @test_mm256_maskz_ternarylogic_epi64
- // CHECK: @llvm.x86.avx512.maskz.pternlog.q.256
+ // CHECK: @llvm.x86.avx512.pternlog.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> zeroinitializer
return _mm256_maskz_ternarylogic_epi64(__U, __A, __B, __C, 4);
}
__m256 test_mm256_shuffle_f32x4(__m256 __A, __m256 __B) {
@@ -5669,20 +7715,20 @@ __m256d test_mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B) {
__m256i test_mm256_shuffle_i32x4(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_shuffle_i32x4
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
return _mm256_shuffle_i32x4(__A, __B, 3);
}
__m256i test_mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_shuffle_i32x4
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_shuffle_i32x4(__W, __U, __A, __B, 3);
}
__m256i test_mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_shuffle_i32x4
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_shuffle_i32x4(__U, __A, __B, 3);
}
@@ -6457,7 +8503,8 @@ void test_mm256_mask_cvtusepi64_storeu_epi16(void * __P, __mmask8 __M, __m256i _
__m128i test_mm_cvtepi32_epi8(__m128i __A) {
// CHECK-LABEL: @test_mm_cvtepi32_epi8
- // CHECK: @llvm.x86.avx512.mask.pmov.db.128
+ // CHECK: trunc <4 x i32> %{{.*}} to <4 x i8>
+ // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
return _mm_cvtepi32_epi8(__A);
}
@@ -6481,7 +8528,8 @@ void test_mm_mask_cvtepi32_storeu_epi8(void * __P, __mmask8 __M, __m128i __A) {
__m128i test_mm256_cvtepi32_epi8(__m256i __A) {
// CHECK-LABEL: @test_mm256_cvtepi32_epi8
- // CHECK: @llvm.x86.avx512.mask.pmov.db.256
+ // CHECK: trunc <8 x i32> %{{.*}} to <8 x i8>
+ // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
return _mm256_cvtepi32_epi8(__A);
}
@@ -6505,7 +8553,8 @@ void test_mm256_mask_cvtepi32_storeu_epi8(void * __P, __mmask8 __M, __m256i __A)
__m128i test_mm_cvtepi32_epi16(__m128i __A) {
// CHECK-LABEL: @test_mm_cvtepi32_epi16
- // CHECK: @llvm.x86.avx512.mask.pmov.dw.128
+ // CHECK: trunc <4 x i32> %{{.*}} to <4 x i16>
+ // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
return _mm_cvtepi32_epi16(__A);
}
@@ -6529,7 +8578,7 @@ void test_mm_mask_cvtepi32_storeu_epi16(void * __P, __mmask8 __M, __m128i __A) {
__m128i test_mm256_cvtepi32_epi16(__m256i __A) {
// CHECK-LABEL: @test_mm256_cvtepi32_epi16
- // CHECK: @llvm.x86.avx512.mask.pmov.dw.256
+ // CHECK: trunc <8 x i32> %{{.*}} to <8 x i16>
return _mm256_cvtepi32_epi16(__A);
}
@@ -6553,7 +8602,8 @@ void test_mm256_mask_cvtepi32_storeu_epi16(void * __P, __mmask8 __M, __m256i __
__m128i test_mm_cvtepi64_epi8(__m128i __A) {
// CHECK-LABEL: @test_mm_cvtepi64_epi8
- // CHECK: @llvm.x86.avx512.mask.pmov.qb.128
+ // CHECK: trunc <2 x i64> %{{.*}} to <2 x i8>
+ // CHECK: shufflevector <2 x i8> %{{.*}}, <2 x i8> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
return _mm_cvtepi64_epi8(__A);
}
@@ -6577,7 +8627,8 @@ void test_mm_mask_cvtepi64_storeu_epi8(void * __P, __mmask8 __M, __m128i __A) {
__m128i test_mm256_cvtepi64_epi8(__m256i __A) {
// CHECK-LABEL: @test_mm256_cvtepi64_epi8
- // CHECK: @llvm.x86.avx512.mask.pmov.qb.256
+ // CHECK: trunc <4 x i64> %{{.*}} to <4 x i8>
+ // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
return _mm256_cvtepi64_epi8(__A);
}
@@ -6601,7 +8652,8 @@ void test_mm256_mask_cvtepi64_storeu_epi8(void * __P, __mmask8 __M, __m256i __A)
__m128i test_mm_cvtepi64_epi32(__m128i __A) {
// CHECK-LABEL: @test_mm_cvtepi64_epi32
- // CHECK: @llvm.x86.avx512.mask.pmov.qd.128
+ // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+ // CHECK: shufflevector <2 x i32> %{{.*}}, <2 x i32> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
return _mm_cvtepi64_epi32(__A);
}
@@ -6625,19 +8677,21 @@ void test_mm_mask_cvtepi64_storeu_epi32(void * __P, __mmask8 __M, __m128i __A) {
__m128i test_mm256_cvtepi64_epi32(__m256i __A) {
// CHECK-LABEL: @test_mm256_cvtepi64_epi32
- // CHECK: @llvm.x86.avx512.mask.pmov.qd.256
+ // CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
return _mm256_cvtepi64_epi32(__A);
}
__m128i test_mm256_mask_cvtepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_cvtepi64_epi32
- // CHECK: @llvm.x86.avx512.mask.pmov.qd.256
+ // CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm256_mask_cvtepi64_epi32(__O, __M, __A);
}
__m128i test_mm256_maskz_cvtepi64_epi32(__mmask8 __M, __m256i __A) {
// CHECK-LABEL: @test_mm256_maskz_cvtepi64_epi32
- // CHECK: @llvm.x86.avx512.mask.pmov.qd.256
+ // CHECK: trunc <4 x i64> %{{.*}} to <4 x i32>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm256_maskz_cvtepi64_epi32(__M, __A);
}
@@ -6649,7 +8703,8 @@ void test_mm256_mask_cvtepi64_storeu_epi32(void * __P, __mmask8 __M, __m256i __A
__m128i test_mm_cvtepi64_epi16(__m128i __A) {
// CHECK-LABEL: @test_mm_cvtepi64_epi16
- // CHECK: @llvm.x86.avx512.mask.pmov.qw.128
+ // CHECK: trunc <2 x i64> %{{.*}} to <2 x i16>
+ // CHECK: shufflevector <2 x i16> %{{.*}}, <2 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
return _mm_cvtepi64_epi16(__A);
}
@@ -6673,7 +8728,8 @@ void test_mm_mask_cvtepi64_storeu_epi16(void * __P, __mmask8 __M, __m128i __A) {
__m128i test_mm256_cvtepi64_epi16(__m256i __A) {
// CHECK-LABEL: @test_mm256_cvtepi64_epi16
- // CHECK: @llvm.x86.avx512.mask.pmov.qw.256
+ // CHECK: trunc <4 x i64> %{{.*}} to <4 x i16>
+ // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
return _mm256_cvtepi64_epi16(__A);
}
@@ -6697,40 +8753,40 @@ void test_mm256_mask_cvtepi64_storeu_epi16(void * __P, __mmask8 __M, __m256i __A
__m128 test_mm256_extractf32x4_ps(__m256 __A) {
// CHECK-LABEL: @test_mm256_extractf32x4_ps
- // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm256_extractf32x4_ps(__A, 1);
}
__m128 test_mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A) {
// CHECK-LABEL: @test_mm256_mask_extractf32x4_ps
- // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm256_mask_extractf32x4_ps(__W, __U, __A, 1);
}
__m128 test_mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A) {
// CHECK-LABEL: @test_mm256_maskz_extractf32x4_ps
- // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm256_maskz_extractf32x4_ps(__U, __A, 1);
}
__m128i test_mm256_extracti32x4_epi32(__m256i __A) {
// CHECK-LABEL: @test_mm256_extracti32x4_epi32
- // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
return _mm256_extracti32x4_epi32(__A, 1);
}
__m128i test_mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_extracti32x4_epi32
- // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm256_mask_extracti32x4_epi32(__W, __U, __A, 1);
}
__m128i test_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_maskz_extracti32x4_epi32
- // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm256_maskz_extracti32x4_epi32(__U, __A, 1);
}
@@ -6945,107 +9001,114 @@ __m256i test_mm256_mask_i32gather_epi32(__m256i __v1_old, __mmask8 __mask, __m25
__m256d test_mm256_permutex_pd(__m256d __X) {
// CHECK-LABEL: @test_mm256_permutex_pd
- // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+ // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
return _mm256_permutex_pd(__X, 3);
}
__m256d test_mm256_mask_permutex_pd(__m256d __W, __mmask8 __U, __m256d __X) {
// CHECK-LABEL: @test_mm256_mask_permutex_pd
- // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask_permutex_pd(__W, __U, __X, 1);
}
__m256d test_mm256_maskz_permutex_pd(__mmask8 __U, __m256d __X) {
// CHECK-LABEL: @test_mm256_maskz_permutex_pd
- // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_maskz_permutex_pd(__U, __X, 1);
}
__m256i test_mm256_permutex_epi64(__m256i __X) {
// CHECK-LABEL: @test_mm256_permutex_epi64
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+ // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
return _mm256_permutex_epi64(__X, 3);
}
__m256i test_mm256_mask_permutex_epi64(__m256i __W, __mmask8 __M, __m256i __X) {
// CHECK-LABEL: @test_mm256_mask_permutex_epi64
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+ // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
// CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_mask_permutex_epi64(__W, __M, __X, 3);
}
__m256i test_mm256_maskz_permutex_epi64(__mmask8 __M, __m256i __X) {
// CHECK-LABEL: @test_mm256_maskz_permutex_epi64
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+ // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
// CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_maskz_permutex_epi64(__M, __X, 3);
}
__m256d test_mm256_permutexvar_pd(__m256i __X, __m256d __Y) {
// CHECK-LABEL: @test_mm256_permutexvar_pd
- // CHECK: @llvm.x86.avx512.mask.permvar.df.256
+ // CHECK: @llvm.x86.avx512.permvar.df.256
return _mm256_permutexvar_pd(__X, __Y);
}
__m256d test_mm256_mask_permutexvar_pd(__m256d __W, __mmask8 __U, __m256i __X, __m256d __Y) {
// CHECK-LABEL: @test_mm256_mask_permutexvar_pd
- // CHECK: @llvm.x86.avx512.mask.permvar.df.256
+ // CHECK: @llvm.x86.avx512.permvar.df.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask_permutexvar_pd(__W, __U, __X, __Y);
}
__m256d test_mm256_maskz_permutexvar_pd(__mmask8 __U, __m256i __X, __m256d __Y) {
// CHECK-LABEL: @test_mm256_maskz_permutexvar_pd
- // CHECK: @llvm.x86.avx512.mask.permvar.df.256
+ // CHECK: @llvm.x86.avx512.permvar.df.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_maskz_permutexvar_pd(__U, __X, __Y);
}
__m256i test_mm256_maskz_permutexvar_epi64(__mmask8 __M, __m256i __X, __m256i __Y) {
// CHECK-LABEL: @test_mm256_maskz_permutexvar_epi64
- // CHECK: @llvm.x86.avx512.mask.permvar.di.256
+ // CHECK: @llvm.x86.avx512.permvar.di.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_maskz_permutexvar_epi64(__M, __X, __Y);
}
__m256i test_mm256_mask_permutexvar_epi64(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
// CHECK-LABEL: @test_mm256_mask_permutexvar_epi64
- // CHECK: @llvm.x86.avx512.mask.permvar.di.256
+ // CHECK: @llvm.x86.avx512.permvar.di.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_mask_permutexvar_epi64(__W, __M, __X, __Y);
}
__m256 test_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y) {
// CHECK-LABEL: @test_mm256_mask_permutexvar_ps
- // CHECK: @llvm.x86.avx512.mask.permvar.sf.256
+ // CHECK: @llvm.x86.avx2.permps
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_permutexvar_ps(__W, __U, __X, __Y);
}
__m256 test_mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y) {
// CHECK-LABEL: @test_mm256_maskz_permutexvar_ps
- // CHECK: @llvm.x86.avx512.mask.permvar.sf.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_permutexvar_ps(__U, __X, __Y);
}
__m256 test_mm256_permutexvar_ps(__m256i __X, __m256 __Y) {
// CHECK-LABEL: @test_mm256_permutexvar_ps
- // CHECK: @llvm.x86.avx512.mask.permvar.sf.256
+ // CHECK: @llvm.x86.avx2.permps
return _mm256_permutexvar_ps( __X, __Y);
}
__m256i test_mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) {
// CHECK-LABEL: @test_mm256_maskz_permutexvar_epi32
- // CHECK: @llvm.x86.avx512.mask.permvar.si.256
+ // CHECK: @llvm.x86.avx2.permd
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_permutexvar_epi32(__M, __X, __Y);
}
__m256i test_mm256_permutexvar_epi32(__m256i __X, __m256i __Y) {
// CHECK-LABEL: @test_mm256_permutexvar_epi32
- // CHECK: @llvm.x86.avx512.mask.permvar.si.256
+ // CHECK: @llvm.x86.avx2.permd
return _mm256_permutexvar_epi32(__X, __Y);
}
__m256i test_mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
// CHECK-LABEL: @test_mm256_mask_permutexvar_epi32
- // CHECK: @llvm.x86.avx512.mask.permvar.si.256
+ // CHECK: @llvm.x86.avx2.permd
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_permutexvar_epi32(__W, __M, __X, __Y);
}
@@ -7187,28 +9250,28 @@ __m256 test_mm256_maskz_moveldup_ps(__mmask8 __U, __m256 __A) {
__m128i test_mm_mask_shuffle_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_mask_shuffle_epi32
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_mask_shuffle_epi32(__W, __U, __A, 1);
}
__m128i test_mm_maskz_shuffle_epi32(__mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_maskz_shuffle_epi32
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_maskz_shuffle_epi32(__U, __A, 2);
}
__m256i test_mm256_mask_shuffle_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_shuffle_epi32
- // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4>
+ // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> undef, <8 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4>
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_shuffle_epi32(__W, __U, __A, 2);
}
__m256i test_mm256_maskz_shuffle_epi32(__mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_maskz_shuffle_epi32
- // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4>
+ // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> undef, <8 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4>
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_shuffle_epi32(__U, __A, 2);
}
diff --git a/test/CodeGen/avx512vlbitalg-builtins.c b/test/CodeGen/avx512vlbitalg-builtins.c
index 9b2a1a469b2a..3dd5b68fd463 100644
--- a/test/CodeGen/avx512vlbitalg-builtins.c
+++ b/test/CodeGen/avx512vlbitalg-builtins.c
@@ -21,23 +21,23 @@ __m256i test_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B) {
return _mm256_maskz_popcnt_epi16(__U, __B);
}
-__m128i test_mm128_popcnt_epi16(__m128i __A) {
- // CHECK-LABEL: @test_mm128_popcnt_epi16
+__m128i test_mm_popcnt_epi16(__m128i __A) {
+ // CHECK-LABEL: @test_mm_popcnt_epi16
// CHECK: @llvm.ctpop.v8i16
- return _mm128_popcnt_epi16(__A);
+ return _mm_popcnt_epi16(__A);
}
-__m128i test_mm128_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_popcnt_epi16
+__m128i test_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_popcnt_epi16
// CHECK: @llvm.ctpop.v8i16
// CHECK: select <8 x i1> %{{[0-9]+}}, <8 x i16> %{{[0-9]+}}, <8 x i16> {{.*}}
- return _mm128_mask_popcnt_epi16(__A, __U, __B);
+ return _mm_mask_popcnt_epi16(__A, __U, __B);
}
-__m128i test_mm128_maskz_popcnt_epi16(__mmask8 __U, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_popcnt_epi16
+__m128i test_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_popcnt_epi16
// CHECK: @llvm.ctpop.v8i16
// CHECK: select <8 x i1> %{{[0-9]+}}, <8 x i16> %{{[0-9]+}}, <8 x i16> {{.*}}
- return _mm128_maskz_popcnt_epi16(__U, __B);
+ return _mm_maskz_popcnt_epi16(__U, __B);
}
__m256i test_mm256_popcnt_epi8(__m256i __A) {
@@ -59,46 +59,46 @@ __m256i test_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B) {
return _mm256_maskz_popcnt_epi8(__U, __B);
}
-__m128i test_mm128_popcnt_epi8(__m128i __A) {
- // CHECK-LABEL: @test_mm128_popcnt_epi8
+__m128i test_mm_popcnt_epi8(__m128i __A) {
+ // CHECK-LABEL: @test_mm_popcnt_epi8
// CHECK: @llvm.ctpop.v16i8
- return _mm128_popcnt_epi8(__A);
+ return _mm_popcnt_epi8(__A);
}
-__m128i test_mm128_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_popcnt_epi8
+__m128i test_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_popcnt_epi8
// CHECK: @llvm.ctpop.v16i8
// CHECK: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i8> {{.*}}
- return _mm128_mask_popcnt_epi8(__A, __U, __B);
+ return _mm_mask_popcnt_epi8(__A, __U, __B);
}
-__m128i test_mm128_maskz_popcnt_epi8(__mmask16 __U, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_popcnt_epi8
+__m128i test_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_popcnt_epi8
// CHECK: @llvm.ctpop.v16i8
// CHECK: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i8> {{.*}}
- return _mm128_maskz_popcnt_epi8(__U, __B);
+ return _mm_maskz_popcnt_epi8(__U, __B);
}
-__mmask32 test_mm256_mask_bitshuffle_epi32_mask(__mmask32 __U, __m256i __A, __m256i __B) {
- // CHECK-LABEL: @test_mm256_mask_bitshuffle_epi32_mask
+__mmask32 test_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B) {
+ // CHECK-LABEL: @test_mm256_mask_bitshuffle_epi64_mask
// CHECK: @llvm.x86.avx512.mask.vpshufbitqmb.256
- return _mm256_mask_bitshuffle_epi32_mask(__U, __A, __B);
+ return _mm256_mask_bitshuffle_epi64_mask(__U, __A, __B);
}
-__mmask32 test_mm256_bitshuffle_epi32_mask(__m256i __A, __m256i __B) {
- // CHECK-LABEL: @test_mm256_bitshuffle_epi32_mask
+__mmask32 test_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B) {
+ // CHECK-LABEL: @test_mm256_bitshuffle_epi64_mask
// CHECK: @llvm.x86.avx512.mask.vpshufbitqmb.256
- return _mm256_bitshuffle_epi32_mask(__A, __B);
+ return _mm256_bitshuffle_epi64_mask(__A, __B);
}
-__mmask16 test_mm128_mask_bitshuffle_epi16_mask(__mmask16 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_bitshuffle_epi16_mask
+__mmask16 test_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_bitshuffle_epi64_mask
// CHECK: @llvm.x86.avx512.mask.vpshufbitqmb.128
- return _mm128_mask_bitshuffle_epi16_mask(__U, __A, __B);
+ return _mm_mask_bitshuffle_epi64_mask(__U, __A, __B);
}
-__mmask16 test_mm128_bitshuffle_epi16_mask(__m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_bitshuffle_epi16_mask
+__mmask16 test_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_bitshuffle_epi64_mask
// CHECK: @llvm.x86.avx512.mask.vpshufbitqmb.128
- return _mm128_bitshuffle_epi16_mask(__A, __B);
+ return _mm_bitshuffle_epi64_mask(__A, __B);
}
diff --git a/test/CodeGen/avx512vlbw-builtins.c b/test/CodeGen/avx512vlbw-builtins.c
index 23fbd4026aaa..c28c3fbd7f81 100644
--- a/test/CodeGen/avx512vlbw-builtins.c
+++ b/test/CodeGen/avx512vlbw-builtins.c
@@ -974,7 +974,7 @@ __m128i test_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_maskz_packs_epi32(__M,__A,__B);
}
-__m128i test_mm_mask_packs_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_packs_epi32
// CHECK: @llvm.x86.sse2.packssdw
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
@@ -986,7 +986,7 @@ __m256i test_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) {
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_maskz_packs_epi32(__M,__A,__B);
}
-__m256i test_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_packs_epi32
// CHECK: @llvm.x86.avx2.packssdw
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
@@ -998,7 +998,7 @@ __m128i test_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) {
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_maskz_packs_epi16(__M,__A,__B);
}
-__m128i test_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_packs_epi16
// CHECK: @llvm.x86.sse2.packsswb
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
@@ -1010,14 +1010,14 @@ __m256i test_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) {
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_maskz_packs_epi16(__M,__A,__B);
}
-__m256i test_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_packs_epi16
// CHECK: @llvm.x86.avx2.packsswb
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_mask_packs_epi16(__W,__M,__A,__B);
}
-__m128i test_mm_mask_packus_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_packus_epi32
// CHECK: @llvm.x86.sse41.packusdw
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
@@ -1038,7 +1038,7 @@ __m256i test_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) {
return _mm256_maskz_packus_epi32(__M,__A,__B);
}
-__m256i test_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_packus_epi32
// CHECK: @llvm.x86.avx2.packusdw
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
@@ -1052,7 +1052,7 @@ __m128i test_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) {
return _mm_maskz_packus_epi16(__M,__A,__B);
}
-__m128i test_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_packus_epi16
// CHECK: @llvm.x86.sse2.packuswb
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
@@ -1066,14 +1066,14 @@ __m256i test_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) {
return _mm256_maskz_packus_epi16(__M,__A,__B);
}
-__m256i test_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_packus_epi16
// CHECK: @llvm.x86.avx2.packuswb
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_mask_packus_epi16(__W,__M,__A,__B);
}
-__m128i test_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_adds_epi8
// CHECK: @llvm.x86.sse2.padds.b
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
@@ -1085,7 +1085,7 @@ __m128i test_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_maskz_adds_epi8(__U,__A,__B);
}
-__m256i test_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_adds_epi8
// CHECK: @llvm.x86.avx2.padds.b
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
@@ -1097,7 +1097,7 @@ __m256i test_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_maskz_adds_epi8(__U,__A,__B);
}
-__m128i test_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_adds_epi16
// CHECK: @llvm.x86.sse2.padds.w
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
@@ -1109,7 +1109,7 @@ __m128i test_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_maskz_adds_epi16(__U,__A,__B);
}
-__m256i test_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_adds_epi16
// CHECK: @llvm.x86.avx2.padds.w
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
@@ -1121,7 +1121,7 @@ __m256i test_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_maskz_adds_epi16(__U,__A,__B);
}
-__m128i test_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_adds_epu8
// CHECK: @llvm.x86.sse2.paddus.b
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
@@ -1133,7 +1133,7 @@ __m128i test_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_maskz_adds_epu8(__U,__A,__B);
}
-__m256i test_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_adds_epu8
// CHECK: @llvm.x86.avx2.paddus.b
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
@@ -1145,7 +1145,7 @@ __m256i test_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_maskz_adds_epu8(__U,__A,__B);
}
-__m128i test_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_adds_epu16
// CHECK: @llvm.x86.sse2.paddus.w
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
@@ -1157,7 +1157,7 @@ __m128i test_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_maskz_adds_epu16(__U,__A,__B);
}
-__m256i test_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_adds_epu16
// CHECK: @llvm.x86.avx2.paddus.w
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
@@ -1169,7 +1169,7 @@ __m256i test_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_maskz_adds_epu16(__U,__A,__B);
}
-__m128i test_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_avg_epu8
// CHECK-NOT: @llvm.x86.sse2.pavg.b
// CHECK: zext <16 x i8> %{{.*}} to <16 x i16>
@@ -1194,7 +1194,7 @@ __m128i test_mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_maskz_avg_epu8(__U,__A,__B);
}
-__m256i test_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_avg_epu8
// CHECK-NOT: @llvm.x86.avx2.pavg.b
// CHECK: zext <32 x i8> %{{.*}} to <32 x i16>
@@ -1219,7 +1219,7 @@ __m256i test_mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_maskz_avg_epu8(__U,__A,__B);
}
-__m128i test_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_avg_epu16
// CHECK-NOT: @llvm.x86.sse2.pavg.w
// CHECK: zext <8 x i16> %{{.*}} to <8 x i32>
@@ -1244,7 +1244,7 @@ __m128i test_mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_maskz_avg_epu16(__U,__A,__B);
}
-__m256i test_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_avg_epu16
// CHECK-NOT: @llvm.x86.avx2.pavg.w
// CHECK: zext <16 x i16> %{{.*}} to <16 x i32>
@@ -1276,7 +1276,7 @@ __m128i test_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B) {
// CHECK: select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}}
return _mm_maskz_max_epi8(__M,__A,__B);
}
-__m128i test_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_max_epi8
// CHECK: [[CMP:%.*]] = icmp sgt <16 x i8> [[X:%.*]], [[Y:%.*]]
// CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
@@ -1290,7 +1290,7 @@ __m256i test_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B) {
// CHECK: select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}}
return _mm256_maskz_max_epi8(__M,__A,__B);
}
-__m256i test_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_max_epi8
// CHECK: [[CMP:%.*]] = icmp sgt <32 x i8> [[X:%.*]], [[Y:%.*]]
// CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
@@ -1304,7 +1304,7 @@ __m128i test_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B) {
// CHECK: select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}}
return _mm_maskz_max_epi16(__M,__A,__B);
}
-__m128i test_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_max_epi16
// CHECK: [[CMP:%.*]] = icmp sgt <8 x i16> [[X:%.*]], [[Y:%.*]]
// CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
@@ -1318,7 +1318,7 @@ __m256i test_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B) {
// CHECK: select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}}
return _mm256_maskz_max_epi16(__M,__A,__B);
}
-__m256i test_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_max_epi16
// CHECK: [[CMP:%.*]] = icmp sgt <16 x i16> [[X:%.*]], [[Y:%.*]]
// CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
@@ -1332,7 +1332,7 @@ __m128i test_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B) {
// CHECK: select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}}
return _mm_maskz_max_epu8(__M,__A,__B);
}
-__m128i test_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_max_epu8
// CHECK: [[CMP:%.*]] = icmp ugt <16 x i8> [[X:%.*]], [[Y:%.*]]
// CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
@@ -1346,7 +1346,7 @@ __m256i test_mm256_maskz_max_epu8(__mmask32 __M, __m256i __A, __m256i __B) {
// CHECK: select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}}
return _mm256_maskz_max_epu8(__M,__A,__B);
}
-__m256i test_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_max_epu8
// CHECK: [[CMP:%.*]] = icmp ugt <32 x i8> [[X:%.*]], [[Y:%.*]]
// CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
@@ -1360,7 +1360,7 @@ __m128i test_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B) {
// CHECK: select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}}
return _mm_maskz_max_epu16(__M,__A,__B);
}
-__m128i test_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_max_epu16
// CHECK: [[CMP:%.*]] = icmp ugt <8 x i16> [[X:%.*]], [[Y:%.*]]
// CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
@@ -1374,7 +1374,7 @@ __m256i test_mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B) {
// CHECK: select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}}
return _mm256_maskz_max_epu16(__M,__A,__B);
}
-__m256i test_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_max_epu16
// CHECK: [[CMP:%.*]] = icmp ugt <16 x i16> [[X:%.*]], [[Y:%.*]]
// CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
@@ -1388,7 +1388,7 @@ __m128i test_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B) {
// CHECK: select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}}
return _mm_maskz_min_epi8(__M,__A,__B);
}
-__m128i test_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_min_epi8
// CHECK: [[CMP:%.*]] = icmp slt <16 x i8> [[X:%.*]], [[Y:%.*]]
// CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
@@ -1402,7 +1402,7 @@ __m256i test_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B) {
// CHECK: select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}}
return _mm256_maskz_min_epi8(__M,__A,__B);
}
-__m256i test_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_min_epi8
// CHECK: [[CMP:%.*]] = icmp slt <32 x i8> [[X:%.*]], [[Y:%.*]]
// CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
@@ -1416,7 +1416,7 @@ __m128i test_mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B) {
// CHECK: select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}}
return _mm_maskz_min_epi16(__M,__A,__B);
}
-__m128i test_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_min_epi16
// CHECK: [[CMP:%.*]] = icmp slt <8 x i16> [[X:%.*]], [[Y:%.*]]
// CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
@@ -1430,7 +1430,7 @@ __m256i test_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B) {
// CHECK: select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}}
return _mm256_maskz_min_epi16(__M,__A,__B);
}
-__m256i test_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_min_epi16
// CHECK: [[CMP:%.*]] = icmp slt <16 x i16> [[X:%.*]], [[Y:%.*]]
// CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
@@ -1444,7 +1444,7 @@ __m128i test_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B) {
// CHECK: select <16 x i1> {{.*}}, <16 x i8> [[RES]], <16 x i8> {{.*}}
return _mm_maskz_min_epu8(__M,__A,__B);
}
-__m128i test_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_min_epu8
// CHECK: [[CMP:%.*]] = icmp ult <16 x i8> [[X:%.*]], [[Y:%.*]]
// CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
@@ -1458,7 +1458,7 @@ __m256i test_mm256_maskz_min_epu8(__mmask32 __M, __m256i __A, __m256i __B) {
// CHECK: select <32 x i1> {{.*}}, <32 x i8> [[RES]], <32 x i8> {{.*}}
return _mm256_maskz_min_epu8(__M,__A,__B);
}
-__m256i test_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_min_epu8
// CHECK: [[CMP:%.*]] = icmp ult <32 x i8> [[X:%.*]], [[Y:%.*]]
// CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
@@ -1472,7 +1472,7 @@ __m128i test_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B) {
// CHECK: select <8 x i1> {{.*}}, <8 x i16> [[RES]], <8 x i16> {{.*}}
return _mm_maskz_min_epu16(__M,__A,__B);
}
-__m128i test_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_min_epu16
// CHECK: [[CMP:%.*]] = icmp ult <8 x i16> [[X:%.*]], [[Y:%.*]]
// CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
@@ -1486,14 +1486,14 @@ __m256i test_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B) {
// CHECK: select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}}
return _mm256_maskz_min_epu16(__M,__A,__B);
}
-__m256i test_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_min_epu16
// CHECK: [[CMP:%.*]] = icmp ult <16 x i16> [[X:%.*]], [[Y:%.*]]
// CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
// CHECK: select <16 x i1> {{.*}}, <16 x i16> [[RES]], <16 x i16> {{.*}}
return _mm256_mask_min_epu16(__W,__M,__A,__B);
}
-__m128i test_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_shuffle_epi8
// CHECK: @llvm.x86.ssse3.pshuf.b
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
@@ -1505,7 +1505,7 @@ __m128i test_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_maskz_shuffle_epi8(__U,__A,__B);
}
-__m256i test_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_shuffle_epi8
// CHECK: @llvm.x86.avx2.pshuf.b
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
@@ -1517,7 +1517,7 @@ __m256i test_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_maskz_shuffle_epi8(__U,__A,__B);
}
-__m128i test_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_subs_epi8
// CHECK: @llvm.x86.sse2.psubs.b
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
@@ -1529,7 +1529,7 @@ __m128i test_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_maskz_subs_epi8(__U,__A,__B);
}
-__m256i test_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_subs_epi8
// CHECK: @llvm.x86.avx2.psubs.b
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
@@ -1541,7 +1541,7 @@ __m256i test_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_maskz_subs_epi8(__U,__A,__B);
}
-__m128i test_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_subs_epi16
// CHECK: @llvm.x86.sse2.psubs.w
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
@@ -1553,7 +1553,7 @@ __m128i test_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_maskz_subs_epi16(__U,__A,__B);
}
-__m256i test_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_subs_epi16
// CHECK: @llvm.x86.avx2.psubs.w
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
@@ -1565,7 +1565,7 @@ __m256i test_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_maskz_subs_epi16(__U,__A,__B);
}
-__m128i test_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_subs_epu8
// CHECK: @llvm.x86.sse2.psubus.b
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
@@ -1577,7 +1577,7 @@ __m128i test_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_maskz_subs_epu8(__U,__A,__B);
}
-__m256i test_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_subs_epu8
// CHECK: @llvm.x86.avx2.psubus.b
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
@@ -1589,7 +1589,7 @@ __m256i test_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_maskz_subs_epu8(__U,__A,__B);
}
-__m128i test_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+__m128i test_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_subs_epu16
// CHECK: @llvm.x86.sse2.psubus.w
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
@@ -1601,7 +1601,7 @@ __m128i test_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_maskz_subs_epu16(__U,__A,__B);
}
-__m256i test_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+__m256i test_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_subs_epu16
// CHECK: @llvm.x86.avx2.psubus.w
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
@@ -1615,45 +1615,51 @@ __m256i test_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
}
-__m128i test_mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) {
+__m128i test_mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) {
// CHECK-LABEL: @test_mm_mask2_permutex2var_epi16
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.hi.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.hi.128
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_mask2_permutex2var_epi16(__A,__I,__U,__B);
}
-__m256i test_mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U, __m256i __B) {
+__m256i test_mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask2_permutex2var_epi16
- // CHECK: @llvm.x86.avx512.mask.vpermi2var.hi.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.hi.256
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_mask2_permutex2var_epi16(__A,__I,__U,__B);
}
__m128i test_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) {
// CHECK-LABEL: @test_mm_permutex2var_epi16
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.hi.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.hi.128
return _mm_permutex2var_epi16(__A,__I,__B);
}
-__m128i test_mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) {
+__m128i test_mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_permutex2var_epi16
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.hi.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.hi.128
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_mask_permutex2var_epi16(__A,__U,__I,__B);
}
-__m128i test_mm_maskz_permutex2var_epi16(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) {
+__m128i test_mm_maskz_permutex2var_epi16(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) {
// CHECK-LABEL: @test_mm_maskz_permutex2var_epi16
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.hi.128
+ // CHECK: @llvm.x86.avx512.vpermi2var.hi.128
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_maskz_permutex2var_epi16(__U,__A,__I,__B);
}
__m256i test_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) {
// CHECK-LABEL: @test_mm256_permutex2var_epi16
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.hi.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.hi.256
return _mm256_permutex2var_epi16(__A,__I,__B);
}
-__m256i test_mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I, __m256i __B) {
+__m256i test_mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_permutex2var_epi16
- // CHECK: @llvm.x86.avx512.mask.vpermt2var.hi.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.hi.256
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_mask_permutex2var_epi16(__A,__U,__I,__B);
}
-__m256i test_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i __I, __m256i __B) {
+__m256i test_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i __I, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_permutex2var_epi16
- // CHECK: @llvm.x86.avx512.maskz.vpermt2var.hi.256
+ // CHECK: @llvm.x86.avx512.vpermi2var.hi.256
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_maskz_permutex2var_epi16(__U,__A,__I,__B);
}
__m128i test_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
@@ -1786,7 +1792,8 @@ __m128i test_mm256_maskz_cvtusepi16_epi8(__mmask16 __M, __m256i __A) {
__m128i test_mm_cvtepi16_epi8(__m128i __A) {
// CHECK-LABEL: @test_mm_cvtepi16_epi8
- // CHECK: @llvm.x86.avx512.mask.pmov.wb.128
+ // CHECK: trunc <8 x i16> %{{.*}} to <8 x i8>
+ // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
return _mm_cvtepi16_epi8(__A);
}
@@ -1804,19 +1811,21 @@ __m128i test_mm_maskz_cvtepi16_epi8(__mmask8 __M, __m128i __A) {
__m128i test_mm256_cvtepi16_epi8(__m256i __A) {
// CHECK-LABEL: @test_mm256_cvtepi16_epi8
- // CHECK: @llvm.x86.avx512.mask.pmov.wb.256
+ // CHECK: trunc <16 x i16> %{{.*}} to <16 x i8>
return _mm256_cvtepi16_epi8(__A);
}
__m128i test_mm256_mask_cvtepi16_epi8(__m128i __O, __mmask16 __M, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_cvtepi16_epi8
- // CHECK: @llvm.x86.avx512.mask.pmov.wb.256
+ // CHECK: trunc <16 x i16> %{{.*}} to <16 x i8>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm256_mask_cvtepi16_epi8(__O, __M, __A);
}
__m128i test_mm256_maskz_cvtepi16_epi8(__mmask16 __M, __m256i __A) {
// CHECK-LABEL: @test_mm256_maskz_cvtepi16_epi8
- // CHECK: @llvm.x86.avx512.mask.pmov.wb.256
+ // CHECK: trunc <16 x i16> %{{.*}} to <16 x i8>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm256_maskz_cvtepi16_epi8(__M, __A);
}
@@ -2016,7 +2025,7 @@ __m256i test_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B)
return _mm256_maskz_unpacklo_epi16(__U, __A, __B);
}
-__m128i test_mm_mask_cvtepi8_epi16(__m128i __W, __mmask32 __U, __m128i __A) {
+__m128i test_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_mask_cvtepi8_epi16
// CHECK: sext <8 x i8> %{{.*}} to <8 x i16>
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
@@ -2030,7 +2039,7 @@ __m128i test_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) {
return _mm_maskz_cvtepi8_epi16(__U, __A);
}
-__m256i test_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask32 __U, __m128i __A) {
+__m256i test_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
// CHECK-LABEL: @test_mm256_mask_cvtepi8_epi16
// CHECK: sext <16 x i8> %{{.*}} to <16 x i16>
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
@@ -2044,7 +2053,7 @@ __m256i test_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) {
return _mm256_maskz_cvtepi8_epi16(__U, __A);
}
-__m128i test_mm_mask_cvtepu8_epi16(__m128i __W, __mmask32 __U, __m128i __A) {
+__m128i test_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_mask_cvtepu8_epi16
// CHECK: zext <8 x i8> %{{.*}} to <8 x i16>
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
@@ -2058,7 +2067,7 @@ __m128i test_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) {
return _mm_maskz_cvtepu8_epi16(__U, __A);
}
-__m256i test_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask32 __U, __m128i __A) {
+__m256i test_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
// CHECK-LABEL: @test_mm256_mask_cvtepu8_epi16
// CHECK: zext <16 x i8> %{{.*}} to <16 x i16>
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
@@ -2601,13 +2610,15 @@ __mmask16 test_mm256_mask_testn_epi16_mask(__mmask16 __U, __m256i __A, __m256i _
__mmask16 test_mm_movepi8_mask(__m128i __A) {
// CHECK-LABEL: @test_mm_movepi8_mask
- // CHECK: @llvm.x86.avx512.cvtb2mask.128
+ // CHECK: [[CMP:%.*]] = icmp slt <16 x i8> %{{.*}}, zeroinitializer
+ // CHECK: bitcast <16 x i1> [[CMP]] to i16
return _mm_movepi8_mask(__A);
}
__mmask32 test_mm256_movepi8_mask(__m256i __A) {
// CHECK-LABEL: @test_mm256_movepi8_mask
- // CHECK: @llvm.x86.avx512.cvtb2mask.256
+ // CHECK: [[CMP:%.*]] = icmp slt <32 x i8> %{{.*}}, zeroinitializer
+ // CHECK: bitcast <32 x i1> [[CMP]] to i32
return _mm256_movepi8_mask(__A);
}
@@ -2887,37 +2898,41 @@ __m128i test_mm_maskz_set1_epi16(__mmask8 __M, short __A) {
}
__m128i test_mm_permutexvar_epi16(__m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_permutexvar_epi16
- // CHECK: @llvm.x86.avx512.mask.permvar.hi.128
+ // CHECK: @llvm.x86.avx512.permvar.hi.128
return _mm_permutexvar_epi16(__A, __B);
}
__m128i test_mm_maskz_permutexvar_epi16(__mmask8 __M, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_maskz_permutexvar_epi16
- // CHECK: @llvm.x86.avx512.mask.permvar.hi.128
+ // CHECK: @llvm.x86.avx512.permvar.hi.128
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_maskz_permutexvar_epi16(__M, __A, __B);
}
__m128i test_mm_mask_permutexvar_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_permutexvar_epi16
- // CHECK: @llvm.x86.avx512.mask.permvar.hi.128
+ // CHECK: @llvm.x86.avx512.permvar.hi.128
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_mask_permutexvar_epi16(__W, __M, __A, __B);
}
__m256i test_mm256_permutexvar_epi16(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_permutexvar_epi16
- // CHECK: @llvm.x86.avx512.mask.permvar.hi.256
+ // CHECK: @llvm.x86.avx512.permvar.hi.256
return _mm256_permutexvar_epi16(__A, __B);
}
__m256i test_mm256_maskz_permutexvar_epi16(__mmask16 __M, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_permutexvar_epi16
- // CHECK: @llvm.x86.avx512.mask.permvar.hi.256
+ // CHECK: @llvm.x86.avx512.permvar.hi.256
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_maskz_permutexvar_epi16(__M, __A, __B);
}
__m256i test_mm256_mask_permutexvar_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_permutexvar_epi16
- // CHECK: @llvm.x86.avx512.mask.permvar.hi.256
+ // CHECK: @llvm.x86.avx512.permvar.hi.256
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_mask_permutexvar_epi16(__W, __M, __A, __B);
}
__m128i test_mm_mask_alignr_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
@@ -2950,103 +2965,109 @@ __m256i test_mm256_maskz_alignr_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
__m128i test_mm_dbsad_epu8(__m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dbsad_epu8
- // CHECK: @llvm.x86.avx512.mask.dbpsadbw.128
+ // CHECK: @llvm.x86.avx512.dbpsadbw.128
return _mm_dbsad_epu8(__A, __B, 170);
}
__m128i test_mm_mask_dbsad_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_dbsad_epu8
- // CHECK: @llvm.x86.avx512.mask.dbpsadbw.128
+ // CHECK: @llvm.x86.avx512.dbpsadbw.128
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_mask_dbsad_epu8(__W, __U, __A, __B, 170);
}
__m128i test_mm_maskz_dbsad_epu8(__mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_maskz_dbsad_epu8
- // CHECK: @llvm.x86.avx512.mask.dbpsadbw.128
+ // CHECK: @llvm.x86.avx512.dbpsadbw.128
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_maskz_dbsad_epu8(__U, __A, __B, 170);
}
__m256i test_mm256_dbsad_epu8(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dbsad_epu8
- // CHECK: @llvm.x86.avx512.mask.dbpsadbw.256
+ // CHECK: @llvm.x86.avx512.dbpsadbw.256
return _mm256_dbsad_epu8(__A, __B, 170);
}
__m256i test_mm256_mask_dbsad_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_dbsad_epu8
- // CHECK: @llvm.x86.avx512.mask.dbpsadbw.256
+ // CHECK: @llvm.x86.avx512.dbpsadbw.256
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_mask_dbsad_epu8(__W, __U, __A, __B, 170);
}
__m256i test_mm256_maskz_dbsad_epu8(__mmask16 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_dbsad_epu8
- // CHECK: @llvm.x86.avx512.mask.dbpsadbw.256
+ // CHECK: @llvm.x86.avx512.dbpsadbw.256
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_maskz_dbsad_epu8(__U, __A, __B, 170);
}
__mmask8 test_mm_movepi16_mask(__m128i __A) {
// CHECK-LABEL: @test_mm_movepi16_mask
- // CHECK: @llvm.x86.avx512.cvtw2mask.128
+ // CHECK: [[CMP:%.*]] = icmp slt <8 x i16> %{{.*}}, zeroinitializer
+ // CHECK: bitcast <8 x i1> [[CMP]] to i8
return _mm_movepi16_mask(__A);
}
__mmask16 test_mm256_movepi16_mask(__m256i __A) {
// CHECK-LABEL: @test_mm256_movepi16_mask
- // CHECK: @llvm.x86.avx512.cvtw2mask.256
+ // CHECK: [[CMP:%.*]] = icmp slt <16 x i16> %{{.*}}, zeroinitializer
+ // CHECK: bitcast <16 x i1> [[CMP]] to i16
return _mm256_movepi16_mask(__A);
}
-__m128i test_mm_mask_shufflehi_epi16(__m128i __W, __mmask32 __U, __m128i __A) {
+__m128i test_mm_mask_shufflehi_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_mask_shufflehi_epi16
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4>
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_mask_shufflehi_epi16(__W, __U, __A, 5);
}
-__m128i test_mm_maskz_shufflehi_epi16(__mmask32 __U, __m128i __A) {
+__m128i test_mm_maskz_shufflehi_epi16(__mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_maskz_shufflehi_epi16
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4>
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_maskz_shufflehi_epi16(__U, __A, 5);
}
-__m128i test_mm_mask_shufflelo_epi16(__m128i __W, __mmask32 __U, __m128i __A) {
+__m128i test_mm_mask_shufflelo_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_mask_shufflelo_epi16
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_mask_shufflelo_epi16(__W, __U, __A, 5);
}
-__m128i test_mm_maskz_shufflelo_epi16(__mmask32 __U, __m128i __A) {
+__m128i test_mm_maskz_shufflelo_epi16(__mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_maskz_shufflelo_epi16
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_maskz_shufflelo_epi16(__U, __A, 5);
}
-__m256i test_mm256_mask_shufflehi_epi16(__m256i __W, __mmask32 __U, __m256i __A) {
+__m256i test_mm256_mask_shufflehi_epi16(__m256i __W, __mmask16 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_shufflehi_epi16
- // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12>
+ // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12>
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_mask_shufflehi_epi16(__W, __U, __A, 5);
}
-__m256i test_mm256_maskz_shufflehi_epi16(__mmask32 __U, __m256i __A) {
+__m256i test_mm256_maskz_shufflehi_epi16(__mmask16 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_maskz_shufflehi_epi16
- // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12>
+ // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12>
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_maskz_shufflehi_epi16(__U, __A, 5);
}
-__m256i test_mm256_mask_shufflelo_epi16(__m256i __W, __mmask32 __U, __m256i __A) {
+__m256i test_mm256_mask_shufflelo_epi16(__m256i __W, __mmask16 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_shufflelo_epi16
- // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15>
+ // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15>
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_mask_shufflelo_epi16(__W, __U, __A, 5);
}
-__m256i test_mm256_maskz_shufflelo_epi16(__mmask32 __U, __m256i __A) {
+__m256i test_mm256_maskz_shufflelo_epi16(__mmask16 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_maskz_shufflelo_epi16
- // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15>
+ // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15>
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_maskz_shufflelo_epi16(__U, __A, 5);
}
diff --git a/test/CodeGen/avx512vldq-builtins.c b/test/CodeGen/avx512vldq-builtins.c
index 3ca4b2135ea7..b21b665eb9be 100644
--- a/test/CodeGen/avx512vldq-builtins.c
+++ b/test/CodeGen/avx512vldq-builtins.c
@@ -421,37 +421,41 @@ __m256i test_mm256_maskz_cvtps_epu64(__mmask8 __U, __m128 __A) {
__m128d test_mm_cvtepi64_pd(__m128i __A) {
// CHECK-LABEL: @test_mm_cvtepi64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtqq2pd.128
+ // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double>
return _mm_cvtepi64_pd(__A);
}
__m128d test_mm_mask_cvtepi64_pd(__m128d __W, __mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_mask_cvtepi64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtqq2pd.128
+ // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask_cvtepi64_pd(__W, __U, __A);
}
__m128d test_mm_maskz_cvtepi64_pd(__mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_maskz_cvtepi64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtqq2pd.128
+ // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_maskz_cvtepi64_pd(__U, __A);
}
__m256d test_mm256_cvtepi64_pd(__m256i __A) {
// CHECK-LABEL: @test_mm256_cvtepi64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtqq2pd.256
+ // CHECK: sitofp <4 x i64> %{{.*}} to <4 x double>
return _mm256_cvtepi64_pd(__A);
}
__m256d test_mm256_mask_cvtepi64_pd(__m256d __W, __mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_cvtepi64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtqq2pd.256
+ // CHECK: sitofp <4 x i64> %{{.*}} to <4 x double>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask_cvtepi64_pd(__W, __U, __A);
}
__m256d test_mm256_maskz_cvtepi64_pd(__mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_maskz_cvtepi64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtqq2pd.256
+ // CHECK: sitofp <4 x i64> %{{.*}} to <4 x double>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_maskz_cvtepi64_pd(__U, __A);
}
@@ -637,37 +641,41 @@ __m256i test_mm256_maskz_cvttps_epu64(__mmask8 __U, __m128 __A) {
__m128d test_mm_cvtepu64_pd(__m128i __A) {
// CHECK-LABEL: @test_mm_cvtepu64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtuqq2pd.128
+ // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double>
return _mm_cvtepu64_pd(__A);
}
__m128d test_mm_mask_cvtepu64_pd(__m128d __W, __mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_mask_cvtepu64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtuqq2pd.128
+ // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask_cvtepu64_pd(__W, __U, __A);
}
__m128d test_mm_maskz_cvtepu64_pd(__mmask8 __U, __m128i __A) {
// CHECK-LABEL: @test_mm_maskz_cvtepu64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtuqq2pd.128
+ // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double>
+ // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_maskz_cvtepu64_pd(__U, __A);
}
__m256d test_mm256_cvtepu64_pd(__m256i __A) {
// CHECK-LABEL: @test_mm256_cvtepu64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtuqq2pd.256
+ // CHECK: uitofp <4 x i64> %{{.*}} to <4 x double>
return _mm256_cvtepu64_pd(__A);
}
__m256d test_mm256_mask_cvtepu64_pd(__m256d __W, __mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_cvtepu64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtuqq2pd.256
+ // CHECK: uitofp <4 x i64> %{{.*}} to <4 x double>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask_cvtepu64_pd(__W, __U, __A);
}
__m256d test_mm256_maskz_cvtepu64_pd(__mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_maskz_cvtepu64_pd
- // CHECK: @llvm.x86.avx512.mask.cvtuqq2pd.256
+ // CHECK: uitofp <4 x i64> %{{.*}} to <4 x double>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_maskz_cvtepu64_pd(__U, __A);
}
@@ -853,13 +861,16 @@ __m256 test_mm256_maskz_reduce_ps(__mmask8 __U, __m256 __A) {
__mmask8 test_mm_movepi32_mask(__m128i __A) {
// CHECK-LABEL: @test_mm_movepi32_mask
- // CHECK: @llvm.x86.avx512.cvtd2mask.128
+ // CHECK: [[CMP:%.*]] = icmp slt <4 x i32> %{{.*}}, zeroinitializer
+ // CHECK: [[SHUF:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: bitcast <8 x i1> [[SHUF]] to i8
return _mm_movepi32_mask(__A);
}
__mmask8 test_mm256_movepi32_mask(__m256i __A) {
// CHECK-LABEL: @test_mm256_movepi32_mask
- // CHECK: @llvm.x86.avx512.cvtd2mask.256
+ // CHECK: [[CMP:%.*]] = icmp slt <8 x i32> %{{.*}}, zeroinitializer
+ // CHECK: bitcast <8 x i1> [[CMP]] to i8
return _mm256_movepi32_mask(__A);
}
@@ -896,33 +907,37 @@ __m256i test_mm256_movm_epi64(__mmask8 __A) {
__mmask8 test_mm_movepi64_mask(__m128i __A) {
// CHECK-LABEL: @test_mm_movepi64_mask
- // CHECK: @llvm.x86.avx512.cvtq2mask.128
+ // CHECK: [[CMP:%.*]] = icmp slt <2 x i64> %{{.*}}, zeroinitializer
+ // CHECK: [[SHUF:%.*]] = shufflevector <2 x i1> [[CMP]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ // CHECK: bitcast <8 x i1> [[SHUF]] to i8
return _mm_movepi64_mask(__A);
}
__mmask8 test_mm256_movepi64_mask(__m256i __A) {
// CHECK-LABEL: @test_mm256_movepi64_mask
- // CHECK: @llvm.x86.avx512.cvtq2mask.256
+ // CHECK: [[CMP:%.*]] = icmp slt <4 x i64> %{{.*}}, zeroinitializer
+ // CHECK: [[SHUF:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: bitcast <8 x i1> [[SHUF]] to i8
return _mm256_movepi64_mask(__A);
}
__m256 test_mm256_broadcast_f32x2(__m128 __A) {
// CHECK-LABEL: @test_mm256_broadcast_f32x2
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
return _mm256_broadcast_f32x2(__A);
}
__m256 test_mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, __m128 __A) {
// CHECK-LABEL: @test_mm256_mask_broadcast_f32x2
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_broadcast_f32x2(__O, __M, __A);
}
__m256 test_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) {
// CHECK-LABEL: @test_mm256_maskz_broadcast_f32x2
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_broadcast_f32x2(__M, __A);
}
@@ -949,40 +964,40 @@ __m256d test_mm256_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) {
__m128i test_mm_broadcast_i32x2(__m128i __A) {
// CHECK-LABEL: @test_mm_broadcast_i32x2
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
return _mm_broadcast_i32x2(__A);
}
__m128i test_mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, __m128i __A) {
// CHECK-LABEL: @test_mm_mask_broadcast_i32x2
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_mask_broadcast_i32x2(__O, __M, __A);
}
__m128i test_mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) {
// CHECK-LABEL: @test_mm_maskz_broadcast_i32x2
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_maskz_broadcast_i32x2(__M, __A);
}
__m256i test_mm256_broadcast_i32x2(__m128i __A) {
// CHECK-LABEL: @test_mm256_broadcast_i32x2
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
return _mm256_broadcast_i32x2(__A);
}
__m256i test_mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, __m128i __A) {
// CHECK-LABEL: @test_mm256_mask_broadcast_i32x2
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_broadcast_i32x2(__O, __M, __A);
}
__m256i test_mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) {
// CHECK-LABEL: @test_mm256_maskz_broadcast_i32x2
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_broadcast_i32x2(__M, __A);
}
@@ -1009,40 +1024,40 @@ __m256i test_mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i const* __A) {
__m128d test_mm256_extractf64x2_pd(__m256d __A) {
// CHECK-LABEL: @test_mm256_extractf64x2_pd
- // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <2 x i32> <i32 2, i32 3>
+ // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
return _mm256_extractf64x2_pd(__A, 1);
}
__m128d test_mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m256d __A) {
// CHECK-LABEL: @test_mm256_mask_extractf64x2_pd
- // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <2 x i32> <i32 2, i32 3>
+ // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
// CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm256_mask_extractf64x2_pd(__W, __U, __A, 1);
}
__m128d test_mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A) {
// CHECK-LABEL: @test_mm256_maskz_extractf64x2_pd
- // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <2 x i32> <i32 2, i32 3>
+ // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
// CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm256_maskz_extractf64x2_pd(__U, __A, 1);
}
__m128i test_mm256_extracti64x2_epi64(__m256i __A) {
// CHECK-LABEL: @test_mm256_extracti64x2_epi64
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <2 x i32> <i32 2, i32 3>
+ // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
return _mm256_extracti64x2_epi64(__A, 1);
}
__m128i test_mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_mask_extracti64x2_epi64
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <2 x i32> <i32 2, i32 3>
+ // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
// CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm256_mask_extracti64x2_epi64(__W, __U, __A, 1);
}
__m128i test_mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A) {
// CHECK-LABEL: @test_mm256_maskz_extracti64x2_epi64
- // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <2 x i32> <i32 2, i32 3>
+ // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
// CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
return _mm256_maskz_extracti64x2_epi64(__U, __A, 1);
}
@@ -1089,48 +1104,48 @@ __m256i test_mm256_maskz_inserti64x2(__mmask8 __U, __m256i __A, __m128i __B) {
__mmask8 test_mm_mask_fpclass_pd_mask(__mmask8 __U, __m128d __A) {
// CHECK-LABEL: @test_mm_mask_fpclass_pd_mask
- // CHECK: @llvm.x86.avx512.mask.fpclass.pd.128
+ // CHECK: @llvm.x86.avx512.fpclass.pd.128
return _mm_mask_fpclass_pd_mask(__U, __A, 2);
}
__mmask8 test_mm_fpclass_pd_mask(__m128d __A) {
// CHECK-LABEL: @test_mm_fpclass_pd_mask
- // CHECK: @llvm.x86.avx512.mask.fpclass.pd.128
+ // CHECK: @llvm.x86.avx512.fpclass.pd.128
return _mm_fpclass_pd_mask(__A, 2);
}
__mmask8 test_mm256_mask_fpclass_pd_mask(__mmask8 __U, __m256d __A) {
// CHECK-LABEL: @test_mm256_mask_fpclass_pd_mask
- // CHECK: @llvm.x86.avx512.mask.fpclass.pd.256
+ // CHECK: @llvm.x86.avx512.fpclass.pd.256
return _mm256_mask_fpclass_pd_mask(__U, __A, 2);
}
__mmask8 test_mm256_fpclass_pd_mask(__m256d __A) {
// CHECK-LABEL: @test_mm256_fpclass_pd_mask
- // CHECK: @llvm.x86.avx512.mask.fpclass.pd.256
+ // CHECK: @llvm.x86.avx512.fpclass.pd.256
return _mm256_fpclass_pd_mask(__A, 2);
}
__mmask8 test_mm_mask_fpclass_ps_mask(__mmask8 __U, __m128 __A) {
// CHECK-LABEL: @test_mm_mask_fpclass_ps_mask
- // CHECK: @llvm.x86.avx512.mask.fpclass.ps.128
+ // CHECK: @llvm.x86.avx512.fpclass.ps.128
return _mm_mask_fpclass_ps_mask(__U, __A, 2);
}
__mmask8 test_mm_fpclass_ps_mask(__m128 __A) {
// CHECK-LABEL: @test_mm_fpclass_ps_mask
- // CHECK: @llvm.x86.avx512.mask.fpclass.ps.128
+ // CHECK: @llvm.x86.avx512.fpclass.ps.128
return _mm_fpclass_ps_mask(__A, 2);
}
__mmask8 test_mm256_mask_fpclass_ps_mask(__mmask8 __U, __m256 __A) {
// CHECK-LABEL: @test_mm256_mask_fpclass_ps_mask
- // CHECK: @llvm.x86.avx512.mask.fpclass.ps.256
+ // CHECK: @llvm.x86.avx512.fpclass.ps.256
return _mm256_mask_fpclass_ps_mask(__U, __A, 2);
}
__mmask8 test_mm256_fpclass_ps_mask(__m256 __A) {
// CHECK-LABEL: @test_mm256_fpclass_ps_mask
- // CHECK: @llvm.x86.avx512.mask.fpclass.ps.256
+ // CHECK: @llvm.x86.avx512.fpclass.ps.256
return _mm256_fpclass_ps_mask(__A, 2);
}
diff --git a/test/CodeGen/avx512vlvbmi2-builtins.c b/test/CodeGen/avx512vlvbmi2-builtins.c
index 6edc66d30eee..aceb97616d2b 100644
--- a/test/CodeGen/avx512vlvbmi2-builtins.c
+++ b/test/CodeGen/avx512vlvbmi2-builtins.c
@@ -2,88 +2,88 @@
#include <immintrin.h>
-__m128i test_mm128_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D) {
- // CHECK-LABEL: @test_mm128_mask_compress_epi16
+__m128i test_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D) {
+ // CHECK-LABEL: @test_mm_mask_compress_epi16
// CHECK: @llvm.x86.avx512.mask.compress.w.128
- return _mm128_mask_compress_epi16(__S, __U, __D);
+ return _mm_mask_compress_epi16(__S, __U, __D);
}
-__m128i test_mm128_maskz_compress_epi16(__mmask8 __U, __m128i __D) {
- // CHECK-LABEL: @test_mm128_maskz_compress_epi16
+__m128i test_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D) {
+ // CHECK-LABEL: @test_mm_maskz_compress_epi16
// CHECK: @llvm.x86.avx512.mask.compress.w.128
- return _mm128_maskz_compress_epi16(__U, __D);
+ return _mm_maskz_compress_epi16(__U, __D);
}
-__m128i test_mm128_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D) {
- // CHECK-LABEL: @test_mm128_mask_compress_epi8
+__m128i test_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D) {
+ // CHECK-LABEL: @test_mm_mask_compress_epi8
// CHECK: @llvm.x86.avx512.mask.compress.b.128
- return _mm128_mask_compress_epi8(__S, __U, __D);
+ return _mm_mask_compress_epi8(__S, __U, __D);
}
-__m128i test_mm128_maskz_compress_epi8(__mmask16 __U, __m128i __D) {
- // CHECK-LABEL: @test_mm128_maskz_compress_epi8
+__m128i test_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D) {
+ // CHECK-LABEL: @test_mm_maskz_compress_epi8
// CHECK: @llvm.x86.avx512.mask.compress.b.128
- return _mm128_maskz_compress_epi8(__U, __D);
+ return _mm_maskz_compress_epi8(__U, __D);
}
-void test_mm128_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D) {
- // CHECK-LABEL: @test_mm128_mask_compressstoreu_epi16
- // CHECK: @llvm.x86.avx512.mask.compress.store.w.128
- _mm128_mask_compressstoreu_epi16(__P, __U, __D);
+void test_mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D) {
+ // CHECK-LABEL: @test_mm_mask_compressstoreu_epi16
+ // CHECK: @llvm.masked.compressstore.v8i16(<8 x i16> %{{.*}}, i16* %{{.*}}, <8 x i1> %{{.*}})
+ _mm_mask_compressstoreu_epi16(__P, __U, __D);
}
-void test_mm128_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D) {
- // CHECK-LABEL: @test_mm128_mask_compressstoreu_epi8
- // CHECK: @llvm.x86.avx512.mask.compress.store.b.128
- _mm128_mask_compressstoreu_epi8(__P, __U, __D);
+void test_mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D) {
+ // CHECK-LABEL: @test_mm_mask_compressstoreu_epi8
+ // CHECK: @llvm.masked.compressstore.v16i8(<16 x i8> %{{.*}}, i8* %{{.*}}, <16 x i1> %{{.*}})
+ _mm_mask_compressstoreu_epi8(__P, __U, __D);
}
-__m128i test_mm128_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D) {
- // CHECK-LABEL: @test_mm128_mask_expand_epi16
+__m128i test_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D) {
+ // CHECK-LABEL: @test_mm_mask_expand_epi16
// CHECK: @llvm.x86.avx512.mask.expand.w.128
- return _mm128_mask_expand_epi16(__S, __U, __D);
+ return _mm_mask_expand_epi16(__S, __U, __D);
}
-__m128i test_mm128_maskz_expand_epi16(__mmask8 __U, __m128i __D) {
- // CHECK-LABEL: @test_mm128_maskz_expand_epi16
+__m128i test_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D) {
+ // CHECK-LABEL: @test_mm_maskz_expand_epi16
// CHECK: @llvm.x86.avx512.mask.expand.w.128
- return _mm128_maskz_expand_epi16(__U, __D);
+ return _mm_maskz_expand_epi16(__U, __D);
}
-__m128i test_mm128_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D) {
- // CHECK-LABEL: @test_mm128_mask_expand_epi8
+__m128i test_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D) {
+ // CHECK-LABEL: @test_mm_mask_expand_epi8
// CHECK: @llvm.x86.avx512.mask.expand.b.128
- return _mm128_mask_expand_epi8(__S, __U, __D);
+ return _mm_mask_expand_epi8(__S, __U, __D);
}
-__m128i test_mm128_maskz_expand_epi8(__mmask16 __U, __m128i __D) {
- // CHECK-LABEL: @test_mm128_maskz_expand_epi8
+__m128i test_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D) {
+ // CHECK-LABEL: @test_mm_maskz_expand_epi8
// CHECK: @llvm.x86.avx512.mask.expand.b.128
- return _mm128_maskz_expand_epi8(__U, __D);
+ return _mm_maskz_expand_epi8(__U, __D);
}
-__m128i test_mm128_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const* __P) {
- // CHECK-LABEL: @test_mm128_mask_expandloadu_epi16
- // CHECK: @llvm.x86.avx512.mask.expand.load.w.128
- return _mm128_mask_expandloadu_epi16(__S, __U, __P);
+__m128i test_mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const* __P) {
+ // CHECK-LABEL: @test_mm_mask_expandloadu_epi16
+ // CHECK: @llvm.masked.expandload.v8i16(i16* %{{.*}}, <8 x i1> %{{.*}}, <8 x i16> %{{.*}})
+ return _mm_mask_expandloadu_epi16(__S, __U, __P);
}
-__m128i test_mm128_maskz_expandloadu_epi16(__mmask8 __U, void const* __P) {
- // CHECK-LABEL: @test_mm128_maskz_expandloadu_epi16
- // CHECK: @llvm.x86.avx512.mask.expand.load.w.128
- return _mm128_maskz_expandloadu_epi16(__U, __P);
+__m128i test_mm_maskz_expandloadu_epi16(__mmask8 __U, void const* __P) {
+ // CHECK-LABEL: @test_mm_maskz_expandloadu_epi16
+ // CHECK: @llvm.masked.expandload.v8i16(i16* %{{.*}}, <8 x i1> %{{.*}}, <8 x i16> %{{.*}})
+ return _mm_maskz_expandloadu_epi16(__U, __P);
}
-__m128i test_mm128_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const* __P) {
- // CHECK-LABEL: @test_mm128_mask_expandloadu_epi8
- // CHECK: @llvm.x86.avx512.mask.expand.load.b.128
- return _mm128_mask_expandloadu_epi8(__S, __U, __P);
+__m128i test_mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const* __P) {
+ // CHECK-LABEL: @test_mm_mask_expandloadu_epi8
+ // CHECK: @llvm.masked.expandload.v16i8(i8* %{{.*}}, <16 x i1> %{{.*}}, <16 x i8> %{{.*}})
+ return _mm_mask_expandloadu_epi8(__S, __U, __P);
}
-__m128i test_mm128_maskz_expandloadu_epi8(__mmask16 __U, void const* __P) {
- // CHECK-LABEL: @test_mm128_maskz_expandloadu_epi8
- // CHECK: @llvm.x86.avx512.mask.expand.load.b.128
- return _mm128_maskz_expandloadu_epi8(__U, __P);
+__m128i test_mm_maskz_expandloadu_epi8(__mmask16 __U, void const* __P) {
+ // CHECK-LABEL: @test_mm_maskz_expandloadu_epi8
+ // CHECK: @llvm.masked.expandload.v16i8(i8* %{{.*}}, <16 x i1> %{{.*}}, <16 x i8> %{{.*}})
+ return _mm_maskz_expandloadu_epi8(__U, __P);
}
__m256i test_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D) {
@@ -112,13 +112,13 @@ __m256i test_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D) {
void test_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D) {
// CHECK-LABEL: @test_mm256_mask_compressstoreu_epi16
- // CHECK: @llvm.x86.avx512.mask.compress.store.w.256
+ // CHECK: @llvm.masked.compressstore.v16i16(<16 x i16> %{{.*}}, i16* %{{.*}}, <16 x i1> %{{.*}})
_mm256_mask_compressstoreu_epi16(__P, __U, __D);
}
void test_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D) {
// CHECK-LABEL: @test_mm256_mask_compressstoreu_epi8
- // CHECK: @llvm.x86.avx512.mask.compress.store.b.256
+ // CHECK: @llvm.masked.compressstore.v32i8(<32 x i8> %{{.*}}, i8* %{{.*}}, <32 x i1> %{{.*}})
_mm256_mask_compressstoreu_epi8(__P, __U, __D);
}
@@ -148,242 +148,266 @@ __m256i test_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D) {
__m256i test_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const* __P) {
// CHECK-LABEL: @test_mm256_mask_expandloadu_epi16
- // CHECK: @llvm.x86.avx512.mask.expand.load.w.256
+ // CHECK: @llvm.masked.expandload.v16i16(i16* %{{.*}}, <16 x i1> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_mask_expandloadu_epi16(__S, __U, __P);
}
__m256i test_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const* __P) {
// CHECK-LABEL: @test_mm256_maskz_expandloadu_epi16
- // CHECK: @llvm.x86.avx512.mask.expand.load.w.256
+ // CHECK: @llvm.masked.expandload.v16i16(i16* %{{.*}}, <16 x i1> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_maskz_expandloadu_epi16(__U, __P);
}
__m256i test_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const* __P) {
// CHECK-LABEL: @test_mm256_mask_expandloadu_epi8
- // CHECK: @llvm.x86.avx512.mask.expand.load.b.256
+ // CHECK: @llvm.masked.expandload.v32i8(i8* %{{.*}}, <32 x i1> %{{.*}}, <32 x i8> %{{.*}})
return _mm256_mask_expandloadu_epi8(__S, __U, __P);
}
__m256i test_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const* __P) {
// CHECK-LABEL: @test_mm256_maskz_expandloadu_epi8
- // CHECK: @llvm.x86.avx512.mask.expand.load.b.256
+ // CHECK: @llvm.masked.expandload.v32i8(i8* %{{.*}}, <32 x i1> %{{.*}}, <32 x i8> %{{.*}})
return _mm256_maskz_expandloadu_epi8(__U, __P);
}
__m256i test_mm256_mask_shldi_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_shldi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshld.q.256
+ // CHECK: @llvm.x86.avx512.vpshld.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_mask_shldi_epi64(__S, __U, __A, __B, 127);
}
__m256i test_mm256_maskz_shldi_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_shldi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshld.q.256
+ // CHECK: @llvm.x86.avx512.vpshld.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_maskz_shldi_epi64(__U, __A, __B, 63);
}
__m256i test_mm256_shldi_epi64(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_shldi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshld.q.256
+ // CHECK: @llvm.x86.avx512.vpshld.q.256
return _mm256_shldi_epi64(__A, __B, 31);
}
-__m128i test_mm128_mask_shldi_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_shldi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshld.q.128
- return _mm128_mask_shldi_epi64(__S, __U, __A, __B, 127);
+__m128i test_mm_mask_shldi_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_shldi_epi64
+ // CHECK: @llvm.x86.avx512.vpshld.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+ return _mm_mask_shldi_epi64(__S, __U, __A, __B, 127);
}
-__m128i test_mm128_maskz_shldi_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_shldi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshld.q.128
- return _mm128_maskz_shldi_epi64(__U, __A, __B, 63);
+__m128i test_mm_maskz_shldi_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_shldi_epi64
+ // CHECK: @llvm.x86.avx512.vpshld.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+ return _mm_maskz_shldi_epi64(__U, __A, __B, 63);
}
-__m128i test_mm128_shldi_epi64(__m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_shldi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshld.q.128
- return _mm128_shldi_epi64(__A, __B, 31);
+__m128i test_mm_shldi_epi64(__m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_shldi_epi64
+ // CHECK: @llvm.x86.avx512.vpshld.q.128
+ return _mm_shldi_epi64(__A, __B, 31);
}
__m256i test_mm256_mask_shldi_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_shldi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshld.d.256
+ // CHECK: @llvm.x86.avx512.vpshld.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_shldi_epi32(__S, __U, __A, __B, 127);
}
__m256i test_mm256_maskz_shldi_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_shldi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshld.d.256
+ // CHECK: @llvm.x86.avx512.vpshld.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_shldi_epi32(__U, __A, __B, 63);
}
__m256i test_mm256_shldi_epi32(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_shldi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshld.d.256
+ // CHECK: @llvm.x86.avx512.vpshld.d.256
return _mm256_shldi_epi32(__A, __B, 31);
}
-__m128i test_mm128_mask_shldi_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_shldi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshld.d.128
- return _mm128_mask_shldi_epi32(__S, __U, __A, __B, 127);
+__m128i test_mm_mask_shldi_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_shldi_epi32
+ // CHECK: @llvm.x86.avx512.vpshld.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_shldi_epi32(__S, __U, __A, __B, 127);
}
-__m128i test_mm128_maskz_shldi_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_shldi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshld.d.128
- return _mm128_maskz_shldi_epi32(__U, __A, __B, 63);
+__m128i test_mm_maskz_shldi_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_shldi_epi32
+ // CHECK: @llvm.x86.avx512.vpshld.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_shldi_epi32(__U, __A, __B, 63);
}
-__m128i test_mm128_shldi_epi32(__m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_shldi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshld.d.128
- return _mm128_shldi_epi32(__A, __B, 31);
+__m128i test_mm_shldi_epi32(__m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_shldi_epi32
+ // CHECK: @llvm.x86.avx512.vpshld.d.128
+ return _mm_shldi_epi32(__A, __B, 31);
}
__m256i test_mm256_mask_shldi_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_shldi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshld.w.256
+ // CHECK: @llvm.x86.avx512.vpshld.w.256
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_mask_shldi_epi16(__S, __U, __A, __B, 127);
}
__m256i test_mm256_maskz_shldi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_shldi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshld.w.256
+ // CHECK: @llvm.x86.avx512.vpshld.w.256
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_maskz_shldi_epi16(__U, __A, __B, 63);
}
__m256i test_mm256_shldi_epi16(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_shldi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshld.w.256
+ // CHECK: @llvm.x86.avx512.vpshld.w.256
return _mm256_shldi_epi16(__A, __B, 31);
}
-__m128i test_mm128_mask_shldi_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_shldi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshld.w.128
- return _mm128_mask_shldi_epi16(__S, __U, __A, __B, 127);
+__m128i test_mm_mask_shldi_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_shldi_epi16
+ // CHECK: @llvm.x86.avx512.vpshld.w.128
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+ return _mm_mask_shldi_epi16(__S, __U, __A, __B, 127);
}
-__m128i test_mm128_maskz_shldi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_shldi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshld.w.128
- return _mm128_maskz_shldi_epi16(__U, __A, __B, 63);
+__m128i test_mm_maskz_shldi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_shldi_epi16
+ // CHECK: @llvm.x86.avx512.vpshld.w.128
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+ return _mm_maskz_shldi_epi16(__U, __A, __B, 63);
}
-__m128i test_mm128_shldi_epi16(__m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_shldi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshld.w.128
- return _mm128_shldi_epi16(__A, __B, 31);
+__m128i test_mm_shldi_epi16(__m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_shldi_epi16
+ // CHECK: @llvm.x86.avx512.vpshld.w.128
+ return _mm_shldi_epi16(__A, __B, 31);
}
__m256i test_mm256_mask_shrdi_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_shrdi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshrd.q.256
+ // CHECK: @llvm.x86.avx512.vpshrd.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_mask_shrdi_epi64(__S, __U, __A, __B, 127);
}
__m256i test_mm256_maskz_shrdi_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_shrdi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshrd.q.256
+ // CHECK: @llvm.x86.avx512.vpshrd.q.256
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
return _mm256_maskz_shrdi_epi64(__U, __A, __B, 63);
}
__m256i test_mm256_shrdi_epi64(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_shrdi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshrd.q.256
+ // CHECK: @llvm.x86.avx512.vpshrd.q.256
return _mm256_shrdi_epi64(__A, __B, 31);
}
-__m128i test_mm128_mask_shrdi_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_shrdi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshrd.q.128
- return _mm128_mask_shrdi_epi64(__S, __U, __A, __B, 127);
+__m128i test_mm_mask_shrdi_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_shrdi_epi64
+ // CHECK: @llvm.x86.avx512.vpshrd.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+ return _mm_mask_shrdi_epi64(__S, __U, __A, __B, 127);
}
-__m128i test_mm128_maskz_shrdi_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_shrdi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshrd.q.128
- return _mm128_maskz_shrdi_epi64(__U, __A, __B, 63);
+__m128i test_mm_maskz_shrdi_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_shrdi_epi64
+ // CHECK: @llvm.x86.avx512.vpshrd.q.128
+ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+ return _mm_maskz_shrdi_epi64(__U, __A, __B, 63);
}
-__m128i test_mm128_shrdi_epi64(__m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_shrdi_epi64
- // CHECK: @llvm.x86.avx512.mask.vpshrd.q.128
- return _mm128_shrdi_epi64(__A, __B, 31);
+__m128i test_mm_shrdi_epi64(__m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_shrdi_epi64
+ // CHECK: @llvm.x86.avx512.vpshrd.q.128
+ return _mm_shrdi_epi64(__A, __B, 31);
}
__m256i test_mm256_mask_shrdi_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_shrdi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshrd.d.256
+ // CHECK: @llvm.x86.avx512.vpshrd.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_shrdi_epi32(__S, __U, __A, __B, 127);
}
__m256i test_mm256_maskz_shrdi_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_shrdi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshrd.d.256
+ // CHECK: @llvm.x86.avx512.vpshrd.d.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_shrdi_epi32(__U, __A, __B, 63);
}
__m256i test_mm256_shrdi_epi32(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_shrdi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshrd.d.256
+ // CHECK: @llvm.x86.avx512.vpshrd.d.256
return _mm256_shrdi_epi32(__A, __B, 31);
}
-__m128i test_mm128_mask_shrdi_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_shrdi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshrd.d.128
- return _mm128_mask_shrdi_epi32(__S, __U, __A, __B, 127);
+__m128i test_mm_mask_shrdi_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_shrdi_epi32
+ // CHECK: @llvm.x86.avx512.vpshrd.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_shrdi_epi32(__S, __U, __A, __B, 127);
}
-__m128i test_mm128_maskz_shrdi_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_shrdi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshrd.d.128
- return _mm128_maskz_shrdi_epi32(__U, __A, __B, 63);
+__m128i test_mm_maskz_shrdi_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_shrdi_epi32
+ // CHECK: @llvm.x86.avx512.vpshrd.d.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_shrdi_epi32(__U, __A, __B, 63);
}
-__m128i test_mm128_shrdi_epi32(__m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_shrdi_epi32
- // CHECK: @llvm.x86.avx512.mask.vpshrd.d.128
- return _mm128_shrdi_epi32(__A, __B, 31);
+__m128i test_mm_shrdi_epi32(__m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_shrdi_epi32
+ // CHECK: @llvm.x86.avx512.vpshrd.d.128
+ return _mm_shrdi_epi32(__A, __B, 31);
}
__m256i test_mm256_mask_shrdi_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_shrdi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshrd.w.256
+ // CHECK: @llvm.x86.avx512.vpshrd.w.256
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_mask_shrdi_epi16(__S, __U, __A, __B, 127);
}
__m256i test_mm256_maskz_shrdi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_shrdi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshrd.w.256
+ // CHECK: @llvm.x86.avx512.vpshrd.w.256
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_maskz_shrdi_epi16(__U, __A, __B, 63);
}
__m256i test_mm256_shrdi_epi16(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_shrdi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshrd.w.256
+ // CHECK: @llvm.x86.avx512.vpshrd.w.256
return _mm256_shrdi_epi16(__A, __B, 31);
}
-__m128i test_mm128_mask_shrdi_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_shrdi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshrd.w.128
- return _mm128_mask_shrdi_epi16(__S, __U, __A, __B, 127);
+__m128i test_mm_mask_shrdi_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_shrdi_epi16
+ // CHECK: @llvm.x86.avx512.vpshrd.w.128
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+ return _mm_mask_shrdi_epi16(__S, __U, __A, __B, 127);
}
-__m128i test_mm128_maskz_shrdi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_shrdi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshrd.w.128
- return _mm128_maskz_shrdi_epi16(__U, __A, __B, 63);
+__m128i test_mm_maskz_shrdi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_shrdi_epi16
+ // CHECK: @llvm.x86.avx512.vpshrd.w.128
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+ return _mm_maskz_shrdi_epi16(__U, __A, __B, 63);
}
-__m128i test_mm128_shrdi_epi16(__m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_shrdi_epi16
- // CHECK: @llvm.x86.avx512.mask.vpshrd.w.128
- return _mm128_shrdi_epi16(__A, __B, 31);
+__m128i test_mm_shrdi_epi16(__m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_shrdi_epi16
+ // CHECK: @llvm.x86.avx512.vpshrd.w.128
+ return _mm_shrdi_epi16(__A, __B, 31);
}
__m256i test_mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
@@ -404,22 +428,22 @@ __m256i test_mm256_shldv_epi64(__m256i __S, __m256i __A, __m256i __B) {
return _mm256_shldv_epi64(__S, __A, __B);
}
-__m128i test_mm128_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_shldv_epi64
+__m128i test_mm_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_shldv_epi64
// CHECK: @llvm.x86.avx512.mask.vpshldv.q.128
- return _mm128_mask_shldv_epi64(__S, __U, __A, __B);
+ return _mm_mask_shldv_epi64(__S, __U, __A, __B);
}
-__m128i test_mm128_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_shldv_epi64
+__m128i test_mm_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_shldv_epi64
// CHECK: @llvm.x86.avx512.maskz.vpshldv.q.128
- return _mm128_maskz_shldv_epi64(__U, __S, __A, __B);
+ return _mm_maskz_shldv_epi64(__U, __S, __A, __B);
}
-__m128i test_mm128_shldv_epi64(__m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_shldv_epi64
+__m128i test_mm_shldv_epi64(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_shldv_epi64
// CHECK: @llvm.x86.avx512.mask.vpshldv.q.128
- return _mm128_shldv_epi64(__S, __A, __B);
+ return _mm_shldv_epi64(__S, __A, __B);
}
__m256i test_mm256_mask_shldv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
@@ -440,22 +464,22 @@ __m256i test_mm256_shldv_epi32(__m256i __S, __m256i __A, __m256i __B) {
return _mm256_shldv_epi32(__S, __A, __B);
}
-__m128i test_mm128_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_shldv_epi32
+__m128i test_mm_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_shldv_epi32
// CHECK: @llvm.x86.avx512.mask.vpshldv.d.128
- return _mm128_mask_shldv_epi32(__S, __U, __A, __B);
+ return _mm_mask_shldv_epi32(__S, __U, __A, __B);
}
-__m128i test_mm128_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_shldv_epi32
+__m128i test_mm_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_shldv_epi32
// CHECK: @llvm.x86.avx512.maskz.vpshldv.d.128
- return _mm128_maskz_shldv_epi32(__U, __S, __A, __B);
+ return _mm_maskz_shldv_epi32(__U, __S, __A, __B);
}
-__m128i test_mm128_shldv_epi32(__m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_shldv_epi32
+__m128i test_mm_shldv_epi32(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_shldv_epi32
// CHECK: @llvm.x86.avx512.mask.vpshldv.d.128
- return _mm128_shldv_epi32(__S, __A, __B);
+ return _mm_shldv_epi32(__S, __A, __B);
}
__m256i test_mm256_mask_shldv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B) {
@@ -476,22 +500,22 @@ __m256i test_mm256_shldv_epi16(__m256i __S, __m256i __A, __m256i __B) {
return _mm256_shldv_epi16(__S, __A, __B);
}
-__m128i test_mm128_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_shldv_epi16
+__m128i test_mm_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_shldv_epi16
// CHECK: @llvm.x86.avx512.mask.vpshldv.w.128
- return _mm128_mask_shldv_epi16(__S, __U, __A, __B);
+ return _mm_mask_shldv_epi16(__S, __U, __A, __B);
}
-__m128i test_mm128_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_shldv_epi16
+__m128i test_mm_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_shldv_epi16
// CHECK: @llvm.x86.avx512.maskz.vpshldv.w.128
- return _mm128_maskz_shldv_epi16(__U, __S, __A, __B);
+ return _mm_maskz_shldv_epi16(__U, __S, __A, __B);
}
-__m128i test_mm128_shldv_epi16(__m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_shldv_epi16
+__m128i test_mm_shldv_epi16(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_shldv_epi16
// CHECK: @llvm.x86.avx512.mask.vpshldv.w.128
- return _mm128_shldv_epi16(__S, __A, __B);
+ return _mm_shldv_epi16(__S, __A, __B);
}
__m256i test_mm256_mask_shrdv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
@@ -512,22 +536,22 @@ __m256i test_mm256_shrdv_epi64(__m256i __S, __m256i __A, __m256i __B) {
return _mm256_shrdv_epi64(__S, __A, __B);
}
-__m128i test_mm128_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_shrdv_epi64
+__m128i test_mm_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_shrdv_epi64
// CHECK: @llvm.x86.avx512.mask.vpshrdv.q.128
- return _mm128_mask_shrdv_epi64(__S, __U, __A, __B);
+ return _mm_mask_shrdv_epi64(__S, __U, __A, __B);
}
-__m128i test_mm128_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_shrdv_epi64
+__m128i test_mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_shrdv_epi64
// CHECK: @llvm.x86.avx512.maskz.vpshrdv.q.128
- return _mm128_maskz_shrdv_epi64(__U, __S, __A, __B);
+ return _mm_maskz_shrdv_epi64(__U, __S, __A, __B);
}
-__m128i test_mm128_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_shrdv_epi64
+__m128i test_mm_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_shrdv_epi64
// CHECK: @llvm.x86.avx512.mask.vpshrdv.q.128
- return _mm128_shrdv_epi64(__S, __A, __B);
+ return _mm_shrdv_epi64(__S, __A, __B);
}
__m256i test_mm256_mask_shrdv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
@@ -548,22 +572,22 @@ __m256i test_mm256_shrdv_epi32(__m256i __S, __m256i __A, __m256i __B) {
return _mm256_shrdv_epi32(__S, __A, __B);
}
-__m128i test_mm128_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_shrdv_epi32
+__m128i test_mm_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_shrdv_epi32
// CHECK: @llvm.x86.avx512.mask.vpshrdv.d.128
- return _mm128_mask_shrdv_epi32(__S, __U, __A, __B);
+ return _mm_mask_shrdv_epi32(__S, __U, __A, __B);
}
-__m128i test_mm128_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_shrdv_epi32
+__m128i test_mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_shrdv_epi32
// CHECK: @llvm.x86.avx512.maskz.vpshrdv.d.128
- return _mm128_maskz_shrdv_epi32(__U, __S, __A, __B);
+ return _mm_maskz_shrdv_epi32(__U, __S, __A, __B);
}
-__m128i test_mm128_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_shrdv_epi32
+__m128i test_mm_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_shrdv_epi32
// CHECK: @llvm.x86.avx512.mask.vpshrdv.d.128
- return _mm128_shrdv_epi32(__S, __A, __B);
+ return _mm_shrdv_epi32(__S, __A, __B);
}
__m256i test_mm256_mask_shrdv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B) {
@@ -584,21 +608,21 @@ __m256i test_mm256_shrdv_epi16(__m256i __S, __m256i __A, __m256i __B) {
return _mm256_shrdv_epi16(__S, __A, __B);
}
-__m128i test_mm128_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_shrdv_epi16
+__m128i test_mm_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_shrdv_epi16
// CHECK: @llvm.x86.avx512.mask.vpshrdv.w.128
- return _mm128_mask_shrdv_epi16(__S, __U, __A, __B);
+ return _mm_mask_shrdv_epi16(__S, __U, __A, __B);
}
-__m128i test_mm128_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_shrdv_epi16
+__m128i test_mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_shrdv_epi16
// CHECK: @llvm.x86.avx512.maskz.vpshrdv.w.128
- return _mm128_maskz_shrdv_epi16(__U, __S, __A, __B);
+ return _mm_maskz_shrdv_epi16(__U, __S, __A, __B);
}
-__m128i test_mm128_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_shrdv_epi16
+__m128i test_mm_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_shrdv_epi16
// CHECK: @llvm.x86.avx512.mask.vpshrdv.w.128
- return _mm128_shrdv_epi16(__S, __A, __B);
+ return _mm_shrdv_epi16(__S, __A, __B);
}
diff --git a/test/CodeGen/avx512vlvnni-builtins.c b/test/CodeGen/avx512vlvnni-builtins.c
index 861b915fdb29..b742019cb6ee 100644
--- a/test/CodeGen/avx512vlvnni-builtins.c
+++ b/test/CodeGen/avx512vlvnni-builtins.c
@@ -4,145 +4,161 @@
__m256i test_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_dpbusd_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpbusd.256
+ // CHECK: @llvm.x86.avx512.vpdpbusd.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_dpbusd_epi32(__S, __U, __A, __B);
}
__m256i test_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_dpbusd_epi32
- // CHECK: @llvm.x86.avx512.maskz.vpdpbusd.256
+ // CHECK: @llvm.x86.avx512.vpdpbusd.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_dpbusd_epi32(__U, __S, __A, __B);
}
__m256i test_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpbusd_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpbusd.256
+ // CHECK: @llvm.x86.avx512.vpdpbusd.256
return _mm256_dpbusd_epi32(__S, __A, __B);
}
__m256i test_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_dpbusds_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpbusds.256
+ // CHECK: @llvm.x86.avx512.vpdpbusds.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_dpbusds_epi32(__S, __U, __A, __B);
}
__m256i test_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_dpbusds_epi32
- // CHECK: @llvm.x86.avx512.maskz.vpdpbusds.256
+ // CHECK: @llvm.x86.avx512.vpdpbusds.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_dpbusds_epi32(__U, __S, __A, __B);
}
__m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpbusds_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpbusds.256
+ // CHECK: @llvm.x86.avx512.vpdpbusds.256
return _mm256_dpbusds_epi32(__S, __A, __B);
}
__m256i test_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_dpwssd_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpwssd.256
+ // CHECK: @llvm.x86.avx512.vpdpwssd.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_dpwssd_epi32(__S, __U, __A, __B);
}
__m256i test_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_dpwssd_epi32
- // CHECK: @llvm.x86.avx512.maskz.vpdpwssd.256
+ // CHECK: @llvm.x86.avx512.vpdpwssd.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_dpwssd_epi32(__U, __S, __A, __B);
}
__m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpwssd_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpwssd.256
+ // CHECK: @llvm.x86.avx512.vpdpwssd.256
return _mm256_dpwssd_epi32(__S, __A, __B);
}
__m256i test_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_dpwssds_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpwssds.256
+ // CHECK: @llvm.x86.avx512.vpdpwssds.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_dpwssds_epi32(__S, __U, __A, __B);
}
__m256i test_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_dpwssds_epi32
- // CHECK: @llvm.x86.avx512.maskz.vpdpwssds.256
+ // CHECK: @llvm.x86.avx512.vpdpwssds.256
+ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_dpwssds_epi32(__U, __S, __A, __B);
}
__m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpwssds_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpwssds.256
+ // CHECK: @llvm.x86.avx512.vpdpwssds.256
return _mm256_dpwssds_epi32(__S, __A, __B);
}
-__m128i test_mm128_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_dpbusd_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpbusd.128
- return _mm128_mask_dpbusd_epi32(__S, __U, __A, __B);
+__m128i test_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_dpbusd_epi32
+ // CHECK: @llvm.x86.avx512.vpdpbusd.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_dpbusd_epi32(__S, __U, __A, __B);
}
-__m128i test_mm128_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_dpbusd_epi32
- // CHECK: @llvm.x86.avx512.maskz.vpdpbusd.128
- return _mm128_maskz_dpbusd_epi32(__U, __S, __A, __B);
+__m128i test_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_dpbusd_epi32
+ // CHECK: @llvm.x86.avx512.vpdpbusd.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_dpbusd_epi32(__U, __S, __A, __B);
}
-__m128i test_mm128_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_dpbusd_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpbusd.128
- return _mm128_dpbusd_epi32(__S, __A, __B);
+__m128i test_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_dpbusd_epi32
+ // CHECK: @llvm.x86.avx512.vpdpbusd.128
+ return _mm_dpbusd_epi32(__S, __A, __B);
}
-__m128i test_mm128_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_dpbusds_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpbusds.128
- return _mm128_mask_dpbusds_epi32(__S, __U, __A, __B);
+__m128i test_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_dpbusds_epi32
+ // CHECK: @llvm.x86.avx512.vpdpbusds.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_dpbusds_epi32(__S, __U, __A, __B);
}
-__m128i test_mm128_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_dpbusds_epi32
- // CHECK: @llvm.x86.avx512.maskz.vpdpbusds.128
- return _mm128_maskz_dpbusds_epi32(__U, __S, __A, __B);
+__m128i test_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_dpbusds_epi32
+ // CHECK: @llvm.x86.avx512.vpdpbusds.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_dpbusds_epi32(__U, __S, __A, __B);
}
-__m128i test_mm128_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_dpbusds_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpbusds.128
- return _mm128_dpbusds_epi32(__S, __A, __B);
+__m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_dpbusds_epi32
+ // CHECK: @llvm.x86.avx512.vpdpbusds.128
+ return _mm_dpbusds_epi32(__S, __A, __B);
}
-__m128i test_mm128_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_dpwssd_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpwssd.128
- return _mm128_mask_dpwssd_epi32(__S, __U, __A, __B);
+__m128i test_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_dpwssd_epi32
+ // CHECK: @llvm.x86.avx512.vpdpwssd.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_dpwssd_epi32(__S, __U, __A, __B);
}
-__m128i test_mm128_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_dpwssd_epi32
- // CHECK: @llvm.x86.avx512.maskz.vpdpwssd.128
- return _mm128_maskz_dpwssd_epi32(__U, __S, __A, __B);
+__m128i test_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_dpwssd_epi32
+ // CHECK: @llvm.x86.avx512.vpdpwssd.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_dpwssd_epi32(__U, __S, __A, __B);
}
-__m128i test_mm128_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_dpwssd_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpwssd.128
- return _mm128_dpwssd_epi32(__S, __A, __B);
+__m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_dpwssd_epi32
+ // CHECK: @llvm.x86.avx512.vpdpwssd.128
+ return _mm_dpwssd_epi32(__S, __A, __B);
}
-__m128i test_mm128_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_mask_dpwssds_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpwssds.128
- return _mm128_mask_dpwssds_epi32(__S, __U, __A, __B);
+__m128i test_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_mask_dpwssds_epi32
+ // CHECK: @llvm.x86.avx512.vpdpwssds.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_dpwssds_epi32(__S, __U, __A, __B);
}
-__m128i test_mm128_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_maskz_dpwssds_epi32
- // CHECK: @llvm.x86.avx512.maskz.vpdpwssds.128
- return _mm128_maskz_dpwssds_epi32(__U, __S, __A, __B);
+__m128i test_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_maskz_dpwssds_epi32
+ // CHECK: @llvm.x86.avx512.vpdpwssds.128
+ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_dpwssds_epi32(__U, __S, __A, __B);
}
-__m128i test_mm128_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
- // CHECK-LABEL: @test_mm128_dpwssds_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpwssds.128
- return _mm128_dpwssds_epi32(__S, __A, __B);
+__m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_dpwssds_epi32
+ // CHECK: @llvm.x86.avx512.vpdpwssds.128
+ return _mm_dpwssds_epi32(__S, __A, __B);
}
diff --git a/test/CodeGen/avx512vnni-builtins.c b/test/CodeGen/avx512vnni-builtins.c
index d79046aa0499..f03c8eaa8261 100644
--- a/test/CodeGen/avx512vnni-builtins.c
+++ b/test/CodeGen/avx512vnni-builtins.c
@@ -4,73 +4,81 @@
__m512i test_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_dpbusd_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpbusd.512
+ // CHECK: @llvm.x86.avx512.vpdpbusd.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_dpbusd_epi32(__S, __U, __A, __B);
}
__m512i test_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_dpbusd_epi32
- // CHECK: @llvm.x86.avx512.maskz.vpdpbusd.512
+ // CHECK: @llvm.x86.avx512.vpdpbusd.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_dpbusd_epi32(__U, __S, __A, __B);
}
__m512i test_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_dpbusd_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpbusd.512
+ // CHECK: @llvm.x86.avx512.vpdpbusd.512
return _mm512_dpbusd_epi32(__S, __A, __B);
}
__m512i test_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_dpbusds_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpbusds.512
+  // CHECK: @llvm.x86.avx512.vpdpbusds.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_dpbusds_epi32(__S, __U, __A, __B);
}
__m512i test_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_dpbusds_epi32
- // CHECK: @llvm.x86.avx512.maskz.vpdpbusds.512
+ // CHECK: @llvm.x86.avx512.vpdpbusds.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_dpbusds_epi32(__U, __S, __A, __B);
}
__m512i test_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_dpbusds_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpbusds.512
+ // CHECK: @llvm.x86.avx512.vpdpbusds.512
return _mm512_dpbusds_epi32(__S, __A, __B);
}
__m512i test_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_dpwssd_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpwssd.512
+ // CHECK: @llvm.x86.avx512.vpdpwssd.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_dpwssd_epi32(__S, __U, __A, __B);
}
__m512i test_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_dpwssd_epi32
- // CHECK: @llvm.x86.avx512.maskz.vpdpwssd.512
+ // CHECK: @llvm.x86.avx512.vpdpwssd.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_dpwssd_epi32(__U, __S, __A, __B);
}
__m512i test_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_dpwssd_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpwssd.512
+ // CHECK: @llvm.x86.avx512.vpdpwssd.512
return _mm512_dpwssd_epi32(__S, __A, __B);
}
__m512i test_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_dpwssds_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpwssds.512
+ // CHECK: @llvm.x86.avx512.vpdpwssds.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_dpwssds_epi32(__S, __U, __A, __B);
}
__m512i test_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_dpwssds_epi32
- // CHECK: @llvm.x86.avx512.maskz.vpdpwssds.512
+ // CHECK: @llvm.x86.avx512.vpdpwssds.512
+ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_dpwssds_epi32(__U, __S, __A, __B);
}
__m512i test_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_dpwssds_epi32
- // CHECK: @llvm.x86.avx512.mask.vpdpwssds.512
+ // CHECK: @llvm.x86.avx512.vpdpwssds.512
return _mm512_dpwssds_epi32(__S, __A, __B);
}
diff --git a/test/CodeGen/backend-unsupported-error.ll b/test/CodeGen/backend-unsupported-error.ll
index 1a15bfc74dfb..75a5992f2353 100644
--- a/test/CodeGen/backend-unsupported-error.ll
+++ b/test/CodeGen/backend-unsupported-error.ll
@@ -3,7 +3,7 @@
; This is to check that backend errors for unsupported features are formatted correctly
-; CHECK: error: test.c:2:20: in function bar i32 (): unsupported call to function foo.2
+; CHECK: error: test.c:2:20: in function bar i32 (): unsupported call to function foo
target triple = "r600-unknown-unknown"
@@ -21,7 +21,7 @@ entry:
ret i32 %call, !dbg !15
}
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind noinline "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10}
@@ -30,11 +30,11 @@ attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fp
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2)
!1 = !DIFile(filename: "test.c", directory: "")
!2 = !{}
-!4 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0, variables: !2)
+!4 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0, retainedNodes: !2)
!5 = !DISubroutineType(types: !6)
!6 = !{!7}
!7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!8 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!8 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
!9 = !{i32 2, !"Dwarf Version", i32 4}
!10 = !{i32 2, !"Debug Info Version", i32 3}
!11 = !{!"clang version 3.9.0"}
diff --git a/test/CodeGen/bitscan-builtins.c b/test/CodeGen/bitscan-builtins.c
index 5fd3f13fbc73..25dfa4046205 100644
--- a/test/CodeGen/bitscan-builtins.c
+++ b/test/CodeGen/bitscan-builtins.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s
// PR33722
-// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-unknown -D_MSC_VER -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-unknown -fms-extensions -fms-compatibility-version=19.00 -emit-llvm -o - %s | FileCheck %s
#include <immintrin.h>
diff --git a/test/CodeGen/bittest-intrin.c b/test/CodeGen/bittest-intrin.c
new file mode 100644
index 000000000000..f89a8267b0c7
--- /dev/null
+++ b/test/CodeGen/bittest-intrin.c
@@ -0,0 +1,135 @@
+// RUN: %clang_cc1 -fms-extensions -triple x86_64-windows-msvc %s -emit-llvm -o - | FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -fms-extensions -triple thumbv7-windows-msvc %s -emit-llvm -o - | FileCheck %s --check-prefix=ARM
+// RUN: %clang_cc1 -fms-extensions -triple aarch64-windows-msvc %s -emit-llvm -o - | FileCheck %s --check-prefix=ARM
+
+volatile unsigned char sink = 0;
+void test32(long *base, long idx) {
+ sink = _bittest(base, idx);
+ sink = _bittestandcomplement(base, idx);
+ sink = _bittestandreset(base, idx);
+ sink = _bittestandset(base, idx);
+ sink = _interlockedbittestandreset(base, idx);
+ sink = _interlockedbittestandset(base, idx);
+ sink = _interlockedbittestandset(base, idx);
+}
+
+void test64(__int64 *base, __int64 idx) {
+ sink = _bittest64(base, idx);
+ sink = _bittestandcomplement64(base, idx);
+ sink = _bittestandreset64(base, idx);
+ sink = _bittestandset64(base, idx);
+ sink = _interlockedbittestandreset64(base, idx);
+ sink = _interlockedbittestandset64(base, idx);
+}
+
+#if defined(_M_ARM) || defined(_M_ARM64)
+void test_arm(long *base, long idx) {
+ sink = _interlockedbittestandreset_acq(base, idx);
+ sink = _interlockedbittestandreset_rel(base, idx);
+ sink = _interlockedbittestandreset_nf(base, idx);
+ sink = _interlockedbittestandset_acq(base, idx);
+ sink = _interlockedbittestandset_rel(base, idx);
+ sink = _interlockedbittestandset_nf(base, idx);
+}
+#endif
+
+// X64-LABEL: define dso_local void @test32(i32* %base, i32 %idx)
+// X64: call i8 asm sideeffect "btl $2, ($1)\0A\09setc ${0:b}", "=r,r,r,~{{.*}}"(i32* %{{.*}}, i32 {{.*}})
+// X64: call i8 asm sideeffect "btcl $2, ($1)\0A\09setc ${0:b}", "=r,r,r,~{{.*}}"(i32* %{{.*}}, i32 {{.*}})
+// X64: call i8 asm sideeffect "btrl $2, ($1)\0A\09setc ${0:b}", "=r,r,r,~{{.*}}"(i32* %{{.*}}, i32 {{.*}})
+// X64: call i8 asm sideeffect "btsl $2, ($1)\0A\09setc ${0:b}", "=r,r,r,~{{.*}}"(i32* %{{.*}}, i32 {{.*}})
+// X64: call i8 asm sideeffect "lock btrl $2, ($1)\0A\09setc ${0:b}", "=r,r,r,~{{.*}}"(i32* %{{.*}}, i32 {{.*}})
+// X64: call i8 asm sideeffect "lock btsl $2, ($1)\0A\09setc ${0:b}", "=r,r,r,~{{.*}}"(i32* %{{.*}}, i32 {{.*}})
+
+// X64-LABEL: define dso_local void @test64(i64* %base, i64 %idx)
+// X64: call i8 asm sideeffect "btq $2, ($1)\0A\09setc ${0:b}", "=r,r,r,~{{.*}}"(i64* %{{.*}}, i64 {{.*}})
+// X64: call i8 asm sideeffect "btcq $2, ($1)\0A\09setc ${0:b}", "=r,r,r,~{{.*}}"(i64* %{{.*}}, i64 {{.*}})
+// X64: call i8 asm sideeffect "btrq $2, ($1)\0A\09setc ${0:b}", "=r,r,r,~{{.*}}"(i64* %{{.*}}, i64 {{.*}})
+// X64: call i8 asm sideeffect "btsq $2, ($1)\0A\09setc ${0:b}", "=r,r,r,~{{.*}}"(i64* %{{.*}}, i64 {{.*}})
+// X64: call i8 asm sideeffect "lock btrq $2, ($1)\0A\09setc ${0:b}", "=r,r,r,~{{.*}}"(i64* %{{.*}}, i64 {{.*}})
+// X64: call i8 asm sideeffect "lock btsq $2, ($1)\0A\09setc ${0:b}", "=r,r,r,~{{.*}}"(i64* %{{.*}}, i64 {{.*}})
+
+// ARM-LABEL: define dso_local {{.*}}void @test32(i32* %base, i32 %idx)
+// ARM: %[[IDXHI:[^ ]*]] = ashr i32 %{{.*}}, 3
+// ARM: %[[BASE:[^ ]*]] = bitcast i32* %{{.*}} to i8*
+// ARM: %[[BYTEADDR:[^ ]*]] = getelementptr inbounds i8, i8* %[[BASE]], i32 %[[IDXHI]]
+// ARM: %[[IDX8:[^ ]*]] = trunc i32 %{{.*}} to i8
+// ARM: %[[IDXLO:[^ ]*]] = and i8 %[[IDX8]], 7
+// ARM: %[[BYTE:[^ ]*]] = load i8, i8* %[[BYTEADDR]], align 1
+// ARM: %[[BYTESHR:[^ ]*]] = lshr i8 %[[BYTE]], %[[IDXLO]]
+// ARM: %[[RES:[^ ]*]] = and i8 %[[BYTESHR]], 1
+// ARM: store volatile i8 %[[RES]], i8* @sink, align 1
+
+// ARM: %[[IDXHI:[^ ]*]] = ashr i32 %{{.*}}, 3
+// ARM: %[[BASE:[^ ]*]] = bitcast i32* %{{.*}} to i8*
+// ARM: %[[BYTEADDR:[^ ]*]] = getelementptr inbounds i8, i8* %[[BASE]], i32 %[[IDXHI]]
+// ARM: %[[IDX8:[^ ]*]] = trunc i32 %{{.*}} to i8
+// ARM: %[[IDXLO:[^ ]*]] = and i8 %[[IDX8]], 7
+// ARM: %[[MASK:[^ ]*]] = shl i8 1, %[[IDXLO]]
+// ARM: %[[BYTE:[^ ]*]] = load i8, i8* %[[BYTEADDR]], align 1
+// ARM: %[[NEWBYTE:[^ ]*]] = xor i8 %[[BYTE]], %[[MASK]]
+// ARM: store i8 %[[NEWBYTE]], i8* %[[BYTEADDR]], align 1
+// ARM: %[[BYTESHR:[^ ]*]] = lshr i8 %[[BYTE]], %[[IDXLO]]
+// ARM: %[[RES:[^ ]*]] = and i8 %[[BYTESHR]], 1
+// ARM: store volatile i8 %[[RES]], i8* @sink, align 1
+
+// ARM: %[[IDXHI:[^ ]*]] = ashr i32 %{{.*}}, 3
+// ARM: %[[BASE:[^ ]*]] = bitcast i32* %{{.*}} to i8*
+// ARM: %[[BYTEADDR:[^ ]*]] = getelementptr inbounds i8, i8* %[[BASE]], i32 %[[IDXHI]]
+// ARM: %[[IDX8:[^ ]*]] = trunc i32 %{{.*}} to i8
+// ARM: %[[IDXLO:[^ ]*]] = and i8 %[[IDX8]], 7
+// ARM: %[[MASK:[^ ]*]] = shl i8 1, %[[IDXLO]]
+// ARM: %[[BYTE:[^ ]*]] = load i8, i8* %[[BYTEADDR]], align 1
+// ARM: %[[NOTMASK:[^ ]*]] = xor i8 %[[MASK]], -1
+// ARM: %[[NEWBYTE:[^ ]*]] = and i8 %[[BYTE]], %[[NOTMASK]]
+// ARM: store i8 %[[NEWBYTE]], i8* %[[BYTEADDR]], align 1
+// ARM: %[[BYTESHR:[^ ]*]] = lshr i8 %[[BYTE]], %[[IDXLO]]
+// ARM: %[[RES:[^ ]*]] = and i8 %[[BYTESHR]], 1
+// ARM: store volatile i8 %[[RES]], i8* @sink, align 1
+
+// ARM: %[[IDXHI:[^ ]*]] = ashr i32 %{{.*}}, 3
+// ARM: %[[BASE:[^ ]*]] = bitcast i32* %{{.*}} to i8*
+// ARM: %[[BYTEADDR:[^ ]*]] = getelementptr inbounds i8, i8* %[[BASE]], i32 %[[IDXHI]]
+// ARM: %[[IDX8:[^ ]*]] = trunc i32 %{{.*}} to i8
+// ARM: %[[IDXLO:[^ ]*]] = and i8 %[[IDX8]], 7
+// ARM: %[[MASK:[^ ]*]] = shl i8 1, %[[IDXLO]]
+// ARM: %[[BYTE:[^ ]*]] = load i8, i8* %[[BYTEADDR]], align 1
+// ARM: %[[NEWBYTE:[^ ]*]] = or i8 %[[BYTE]], %[[MASK]]
+// ARM: store i8 %[[NEWBYTE]], i8* %[[BYTEADDR]], align 1
+// ARM: %[[BYTESHR:[^ ]*]] = lshr i8 %[[BYTE]], %[[IDXLO]]
+// ARM: %[[RES:[^ ]*]] = and i8 %[[BYTESHR]], 1
+// ARM: store volatile i8 %[[RES]], i8* @sink, align 1
+
+// ARM: %[[IDXHI:[^ ]*]] = ashr i32 %{{.*}}, 3
+// ARM: %[[BASE:[^ ]*]] = bitcast i32* %{{.*}} to i8*
+// ARM: %[[BYTEADDR:[^ ]*]] = getelementptr inbounds i8, i8* %[[BASE]], i32 %[[IDXHI]]
+// ARM: %[[IDX8:[^ ]*]] = trunc i32 %{{.*}} to i8
+// ARM: %[[IDXLO:[^ ]*]] = and i8 %[[IDX8]], 7
+// ARM: %[[MASK:[^ ]*]] = shl i8 1, %[[IDXLO]]
+// ARM: %[[NOTMASK:[^ ]*]] = xor i8 %[[MASK]], -1
+// ARM: %[[BYTE:[^ ]*]] = atomicrmw and i8* %[[BYTEADDR]], i8 %[[NOTMASK]] seq_cst
+// ARM: %[[BYTESHR:[^ ]*]] = lshr i8 %[[BYTE]], %[[IDXLO]]
+// ARM: %[[RES:[^ ]*]] = and i8 %[[BYTESHR]], 1
+// ARM: store volatile i8 %[[RES]], i8* @sink, align 1
+
+// ARM: %[[IDXHI:[^ ]*]] = ashr i32 %{{.*}}, 3
+// ARM: %[[BASE:[^ ]*]] = bitcast i32* %{{.*}} to i8*
+// ARM: %[[BYTEADDR:[^ ]*]] = getelementptr inbounds i8, i8* %[[BASE]], i32 %[[IDXHI]]
+// ARM: %[[IDX8:[^ ]*]] = trunc i32 %{{.*}} to i8
+// ARM: %[[IDXLO:[^ ]*]] = and i8 %[[IDX8]], 7
+// ARM: %[[MASK:[^ ]*]] = shl i8 1, %[[IDXLO]]
+// ARM: %[[BYTE:[^ ]*]] = atomicrmw or i8* %[[BYTEADDR]], i8 %[[MASK]] seq_cst
+// ARM: %[[BYTESHR:[^ ]*]] = lshr i8 %[[BYTE]], %[[IDXLO]]
+// ARM: %[[RES:[^ ]*]] = and i8 %[[BYTESHR]], 1
+// ARM: store volatile i8 %[[RES]], i8* @sink, align 1
+
+
+// Just look for the atomicrmw instructions.
+
+// ARM-LABEL: define dso_local {{.*}}void @test_arm(i32* %base, i32 %idx)
+// ARM: atomicrmw and i8* %{{.*}}, i8 {{.*}} acquire
+// ARM: atomicrmw and i8* %{{.*}}, i8 {{.*}} release
+// ARM: atomicrmw and i8* %{{.*}}, i8 {{.*}} monotonic
+// ARM: atomicrmw or i8* %{{.*}}, i8 {{.*}} acquire
+// ARM: atomicrmw or i8* %{{.*}}, i8 {{.*}} release
+// ARM: atomicrmw or i8* %{{.*}}, i8 {{.*}} monotonic
diff --git a/test/CodeGen/block-byref-aggr.c b/test/CodeGen/block-byref-aggr.c
index 7d146a2d4777..962f100638f0 100644
--- a/test/CodeGen/block-byref-aggr.c
+++ b/test/CodeGen/block-byref-aggr.c
@@ -24,7 +24,7 @@ void test0() {
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[BYREF]], [[BYREF]]* [[T0]], i32 0, i32 4
// CHECK-NEXT: [[T2:%.*]] = bitcast [[AGG]]* [[T1]] to i8*
// CHECK-NEXT: [[T3:%.*]] = bitcast [[AGG]]* [[TEMP]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T2]], i8* [[T3]], i64 4, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[T2]], i8* align 4 [[T3]], i64 4, i1 false)
// Verify that there's nothing else significant in the function.
// CHECK-NEXT: [[T0:%.*]] = bitcast [[BYREF]]* [[A]] to i8*
// CHECK-NEXT: call void @_Block_object_dispose(i8* [[T0]], i32 8)
@@ -50,14 +50,14 @@ void test1() {
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[B_BYREF]], [[B_BYREF]]* [[T0]], i32 0, i32 4
// CHECK-NEXT: [[T2:%.*]] = bitcast [[AGG]]* [[T1]] to i8*
// CHECK-NEXT: [[T3:%.*]] = bitcast [[AGG]]* [[TEMP]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T2]], i8* [[T3]], i64 4, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[T2]], i8* align 4 [[T3]], i64 4, i1 false)
// Then for 'a':
// CHECK-NEXT: [[A_FORWARDING:%.*]] = getelementptr inbounds [[A_BYREF]], [[A_BYREF]]* [[A]], i32 0, i32 1
// CHECK-NEXT: [[T0:%.*]] = load [[A_BYREF]]*, [[A_BYREF]]** [[A_FORWARDING]]
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[A_BYREF]], [[A_BYREF]]* [[T0]], i32 0, i32 4
// CHECK-NEXT: [[T2:%.*]] = bitcast [[AGG]]* [[T1]] to i8*
// CHECK-NEXT: [[T3:%.*]] = bitcast [[AGG]]* [[TEMP]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T2]], i8* [[T3]], i64 4, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[T2]], i8* align 4 [[T3]], i64 4, i1 false)
// Verify that there's nothing else significant in the function.
// CHECK-NEXT: [[T0:%.*]] = bitcast [[B_BYREF]]* [[B]] to i8*
// CHECK-NEXT: call void @_Block_object_dispose(i8* [[T0]], i32 8)
diff --git a/test/CodeGen/blocks-windows.c b/test/CodeGen/blocks-windows.c
index ced00ef015fc..2dd97ffdd2f3 100644
--- a/test/CodeGen/blocks-windows.c
+++ b/test/CodeGen/blocks-windows.c
@@ -67,8 +67,8 @@ int (*g(void))(void) {
return _Block_copy(^{ ++i; return i; });
}
-// CHECK-BLOCKS-IN-BLOCKS-DECL: @_NSConcreteStackBlock = external dllexport global i8*
-// CHECK-BLOCKS-IN-BLOCKS-DEFN: @_NSConcreteStackBlock = common dllexport global [5 x i32]
+// CHECK-BLOCKS-IN-BLOCKS-DECL: @_NSConcreteStackBlock = external dso_local dllexport global i8*
+// CHECK-BLOCKS-IN-BLOCKS-DEFN: @_NSConcreteStackBlock = common dso_local dllexport global [5 x i32]
// CHECK-BLOCKS-NOT-IN-BLOCKS: @_NSConcreteStackBlock = external dllimport global i8*
// CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN: @_NSConcreteStackBlock = external dllimport global i8*
// CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT: @_NSConcreteStackBlock = external dllimport global i8*
diff --git a/test/CodeGen/bmi-builtins.c b/test/CodeGen/bmi-builtins.c
index f78e3fdd4a17..91271f0b3599 100644
--- a/test/CodeGen/bmi-builtins.c
+++ b/test/CodeGen/bmi-builtins.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
// NOTE: This should match the tests in llvm/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll
diff --git a/test/CodeGen/bmi2-builtins.c b/test/CodeGen/bmi2-builtins.c
index 3a5d5e756ddb..9cce92474c85 100644
--- a/test/CodeGen/bmi2-builtins.c
+++ b/test/CodeGen/bmi2-builtins.c
@@ -2,7 +2,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi2 -emit-llvm -o - | FileCheck %s --check-prefix=B32
-#include <x86intrin.h>
+#include <immintrin.h>
unsigned int test_bzhi_u32(unsigned int __X, unsigned int __Y) {
// CHECK: @llvm.x86.bmi.bzhi.32
diff --git a/test/CodeGen/builtin-abs.c b/test/CodeGen/builtin-abs.c
new file mode 100644
index 000000000000..791395368a8c
--- /dev/null
+++ b/test/CodeGen/builtin-abs.c
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+
+int absi(int x) {
+// CHECK-LABEL: @absi(
+// CHECK: [[NEG:%.*]] = sub nsw i32 0, [[X:%.*]]
+// CHECK: [[CMP:%.*]] = icmp slt i32 [[X]], 0
+// CHECK: [[SEL:%.*]] = select i1 [[CMP]], i32 [[NEG]], i32 [[X]]
+//
+ return __builtin_abs(x);
+}
+
+long absl(long x) {
+// CHECK-LABEL: @absl(
+// CHECK: [[NEG:%.*]] = sub nsw i64 0, [[X:%.*]]
+// CHECK: [[CMP:%.*]] = icmp slt i64 [[X]], 0
+// CHECK: [[SEL:%.*]] = select i1 [[CMP]], i64 [[NEG]], i64 [[X]]
+//
+ return __builtin_labs(x);
+}
+
+long long absll(long long x) {
+// CHECK-LABEL: @absll(
+// CHECK: [[NEG:%.*]] = sub nsw i64 0, [[X:%.*]]
+// CHECK: [[CMP:%.*]] = icmp slt i64 [[X]], 0
+// CHECK: [[SEL:%.*]] = select i1 [[CMP]], i64 [[NEG]], i64 [[X]]
+//
+ return __builtin_llabs(x);
+}
+
diff --git a/test/CodeGen/builtin-clflushopt.c b/test/CodeGen/builtin-clflushopt.c
index f82ac4638f40..4bf6f7efcc1f 100644
--- a/test/CodeGen/builtin-clflushopt.c
+++ b/test/CodeGen/builtin-clflushopt.c
@@ -1,6 +1,6 @@
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-apple-darwin -target-feature +clflushopt -emit-llvm -o - -Wall -Werror | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
void test_mm_clflushopt(char * __m) {
//CHECK-LABEL: @test_mm_clflushopt
diff --git a/test/CodeGen/builtin-clwb.c b/test/CodeGen/builtin-clwb.c
index 96d00a6a7ca1..11c37c17694c 100644
--- a/test/CodeGen/builtin-clwb.c
+++ b/test/CodeGen/builtin-clwb.c
@@ -1,6 +1,6 @@
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-apple-darwin -target-feature +clwb -emit-llvm -o - -Wall -Werror | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
void test_mm_clwb(const void *__m) {
//CHECK-LABEL: @test_mm_clwb
diff --git a/test/CodeGen/builtin-memfns.c b/test/CodeGen/builtin-memfns.c
index 4a06160ccbc6..d93a5aadae27 100644
--- a/test/CodeGen/builtin-memfns.c
+++ b/test/CodeGen/builtin-memfns.c
@@ -54,14 +54,14 @@ int test6(char *X) {
int test7(int *p) {
struct snd_pcm_hw_params_t* hwparams; // incomplete type.
- // CHECK: call void @llvm.memset{{.*}}256, i32 4, i1 false)
+ // CHECK: call void @llvm.memset{{.*}} align 4 {{.*}}256, i1 false)
__builtin_memset(p, 0, 256); // Should be alignment = 4
- // CHECK: call void @llvm.memset{{.*}}256, i32 1, i1 false)
+ // CHECK: call void @llvm.memset{{.*}} align 1 {{.*}}256, i1 false)
__builtin_memset((char*)p, 0, 256); // Should be alignment = 1
__builtin_memset(hwparams, 0, 256); // No crash alignment = 1
- // CHECK: call void @llvm.memset{{.*}}256, i32 1, i1 false)
+  // CHECK: call void @llvm.memset{{.*}} align 1 {{.*}}256, i1 false)
}
// <rdar://problem/11314941>
@@ -73,13 +73,13 @@ struct PS {
struct PS ps;
void test8(int *arg) {
// CHECK: @test8
- // CHECK: call void @llvm.memcpy{{.*}} 16, i32 1, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}} align 4 {{.*}} align 1 {{.*}} 16, i1 false)
__builtin_memcpy(arg, ps.modes, sizeof(struct PS));
}
__attribute((aligned(16))) int x[4], y[4];
void test9() {
// CHECK: @test9
- // CHECK: call void @llvm.memcpy{{.*}} 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}} align 16 {{.*}} align 16 {{.*}} 16, i1 false)
__builtin_memcpy(x, y, sizeof(y));
}
diff --git a/test/CodeGen/builtin-movdir.c b/test/CodeGen/builtin-movdir.c
new file mode 100644
index 000000000000..958690469a80
--- /dev/null
+++ b/test/CodeGen/builtin-movdir.c
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -ffreestanding -Wall -pedantic -triple x86_64-unknown-unknown -target-feature +movdiri -target-feature +movdir64b %s -emit-llvm -o - | FileCheck %s --check-prefix=X86_64 --check-prefix=CHECK
+// RUN: %clang_cc1 -ffreestanding -Wall -pedantic -triple i386-unknown-unknown -target-feature +movdiri -target-feature +movdir64b %s -emit-llvm -o - | FileCheck %s --check-prefix=X86 --check-prefix=CHECK
+
+#include <immintrin.h>
+#include <stdint.h>
+
+void test_directstore32(void *dst, uint32_t value) {
+ // CHECK-LABEL: test_directstore32
+ // CHECK: call void @llvm.x86.directstore32
+ _directstoreu_u32(dst, value);
+}
+
+#ifdef __x86_64__
+
+void test_directstore64(void *dst, uint64_t value) {
+ // X86_64-LABEL: test_directstore64
+ // X86_64: call void @llvm.x86.directstore64
+ _directstoreu_u64(dst, value);
+}
+
+#endif
+
+void test_dir64b(void *dst, const void *src) {
+ // CHECK-LABEL: test_dir64b
+ // CHECK: [[PTRINT1:%.+]] = ptrtoint
+ // X86: [[MASKEDPTR1:%.+]] = and i32 [[PTRINT1]], 63
+ // X86: [[MASKCOND1:%.+]] = icmp eq i32 [[MASKEDPTR1]], 0
+ // X86_64: [[MASKEDPTR1:%.+]] = and i64 [[PTRINT1]], 63
+ // X86_64: [[MASKCOND1:%.+]] = icmp eq i64 [[MASKEDPTR1]], 0
+ // CHECK: call void @llvm.x86.movdir64b
+ _movdir64b(dst, src);
+}
+
+// CHECK: declare void @llvm.x86.directstore32(i8*, i32)
+// X86_64: declare void @llvm.x86.directstore64(i8*, i64)
+// CHECK: declare void @llvm.x86.movdir64b(i8*, i8*)
diff --git a/test/CodeGen/builtin-wbinvd.c b/test/CodeGen/builtin-wbinvd.c
new file mode 100644
index 000000000000..fe5ad58ca2f3
--- /dev/null
+++ b/test/CodeGen/builtin-wbinvd.c
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+#include <x86intrin.h>
+
+void test_wbinvd(void) {
+ //CHECK-LABEL: @test_wbinvd
+ //CHECK: call void @llvm.x86.wbinvd()
+ _wbinvd();
+}
diff --git a/test/CodeGen/builtin-wbnoinvd.c b/test/CodeGen/builtin-wbnoinvd.c
new file mode 100644
index 000000000000..5bea5acf5b09
--- /dev/null
+++ b/test/CodeGen/builtin-wbnoinvd.c
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +wbnoinvd -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+#include <immintrin.h>
+
+void test_wbnoinvd(void) {
+ //CHECK-LABEL: @test_wbnoinvd
+ //CHECK: call void @llvm.x86.wbnoinvd()
+ _wbnoinvd();
+}
diff --git a/test/CodeGen/builtins-arm.c b/test/CodeGen/builtins-arm.c
index e04349f1a38a..020f2b4dc528 100644
--- a/test/CodeGen/builtins-arm.c
+++ b/test/CodeGen/builtins-arm.c
@@ -8,69 +8,85 @@ void *f0()
}
void f1(char *a, char *b) {
+ // CHECK: call {{.*}} @__clear_cache
__clear_cache(a,b);
}
-// CHECK: call {{.*}} @__clear_cache
+float test_vcvtrf0(float f) {
+ // CHECK: call float @llvm.arm.vcvtr.f32(float %f)
+ return __builtin_arm_vcvtr_f(f, 0);
+}
+
+float test_vcvtrf1(float f) {
+ // CHECK: call float @llvm.arm.vcvtru.f32(float %f)
+ return __builtin_arm_vcvtr_f(f, 1);
+}
+
+double test_vcvtrd0(double d) {
+ // CHECK: call float @llvm.arm.vcvtr.f64(double %d)
+ return __builtin_arm_vcvtr_d(d, 0);
+}
+
+double test_vcvtrd1(double d) {
+  // CHECK: call float @llvm.arm.vcvtru.f64(double %d)
+ return __builtin_arm_vcvtr_d(d, 1);
+}
void test_eh_return_data_regno()
{
+ // CHECK: store volatile i32 0
+ // CHECK: store volatile i32 1
volatile int res;
- res = __builtin_eh_return_data_regno(0); // CHECK: store volatile i32 0
- res = __builtin_eh_return_data_regno(1); // CHECK: store volatile i32 1
+ res = __builtin_eh_return_data_regno(0);
+ res = __builtin_eh_return_data_regno(1);
}
void nop() {
+ // CHECK: call {{.*}} @llvm.arm.hint(i32 0)
__builtin_arm_nop();
}
-// CHECK: call {{.*}} @llvm.arm.hint(i32 0)
-
void yield() {
+ // CHECK: call {{.*}} @llvm.arm.hint(i32 1)
__builtin_arm_yield();
}
-// CHECK: call {{.*}} @llvm.arm.hint(i32 1)
-
void wfe() {
+ // CHECK: call {{.*}} @llvm.arm.hint(i32 2)
__builtin_arm_wfe();
}
-// CHECK: call {{.*}} @llvm.arm.hint(i32 2)
-
void wfi() {
+ // CHECK: call {{.*}} @llvm.arm.hint(i32 3)
__builtin_arm_wfi();
}
-// CHECK: call {{.*}} @llvm.arm.hint(i32 3)
-
void sev() {
+ // CHECK: call {{.*}} @llvm.arm.hint(i32 4)
__builtin_arm_sev();
}
-// CHECK: call {{.*}} @llvm.arm.hint(i32 4)
-
void sevl() {
+ // CHECK: call {{.*}} @llvm.arm.hint(i32 5)
__builtin_arm_sevl();
}
-// CHECK: call {{.*}} @llvm.arm.hint(i32 5)
-
void dbg() {
+ // CHECK: call {{.*}} @llvm.arm.dbg(i32 0)
__builtin_arm_dbg(0);
}
-// CHECK: call {{.*}} @llvm.arm.dbg(i32 0)
-
void test_barrier() {
- __builtin_arm_dmb(1); //CHECK: call {{.*}} @llvm.arm.dmb(i32 1)
- __builtin_arm_dsb(2); //CHECK: call {{.*}} @llvm.arm.dsb(i32 2)
- __builtin_arm_isb(3); //CHECK: call {{.*}} @llvm.arm.isb(i32 3)
+ //CHECK: call {{.*}} @llvm.arm.dmb(i32 1)
+ //CHECK: call {{.*}} @llvm.arm.dsb(i32 2)
+ //CHECK: call {{.*}} @llvm.arm.isb(i32 3)
+ __builtin_arm_dmb(1);
+ __builtin_arm_dsb(2);
+ __builtin_arm_isb(3);
}
-// CHECK: call {{.*}} @llvm.bitreverse.i32(i32 %a)
-
unsigned rbit(unsigned a) {
+ // CHECK: call {{.*}} @llvm.bitreverse.i32(i32 %a)
return __builtin_arm_rbit(a);
}
diff --git a/test/CodeGen/builtins-hexagon-circ.c b/test/CodeGen/builtins-hexagon-circ.c
new file mode 100644
index 000000000000..dc8f5a693fe3
--- /dev/null
+++ b/test/CodeGen/builtins-hexagon-circ.c
@@ -0,0 +1,156 @@
+// REQUIRES: hexagon-registered-target
+// RUN: %clang_cc1 -triple hexagon-unknown-elf -emit-llvm %s -o - | FileCheck %s
+
+// CHECK-LABEL: test1
+// CHECK: @llvm.hexagon.L2.loadrub.pci
+unsigned char test1(int mod, void *start) {
+ unsigned char *base = start;
+ return __builtin_HEXAGON_L2_loadrub_pci(&base, 4, mod, start);
+}
+
+// CHECK-LABEL: test2
+// CHECK: @llvm.hexagon.L2.loadrb.pci
+unsigned char test2(int mod, void *start) {
+ char *base = start;
+ return __builtin_HEXAGON_L2_loadrb_pci(&base, 4, mod, start);
+}
+
+// CHECK-LABEL: test3
+// CHECK: @llvm.hexagon.L2.loadruh.pci
+unsigned short test3(int mod, void *start) {
+ unsigned short *base = start;
+ return __builtin_HEXAGON_L2_loadruh_pci(&base, 4, mod, start);
+}
+
+// CHECK-LABEL: test4
+// CHECK: @llvm.hexagon.L2.loadrh.pci
+short test4(int mod, void *start) {
+ short *base = start;
+ return __builtin_HEXAGON_L2_loadrh_pci(&base, 4, mod, start);
+}
+
+// CHECK-LABEL: test5
+// CHECK: @llvm.hexagon.L2.loadri.pci
+int test5(int mod, void *start) {
+ int *base = start;
+ return __builtin_HEXAGON_L2_loadri_pci(&base, 4, mod, start);
+}
+
+// CHECK-LABEL: test6
+// CHECK: @llvm.hexagon.L2.loadrd.pci
+long long test6(int mod, void *start) {
+ long long *base = start;
+ return __builtin_HEXAGON_L2_loadrd_pci(&base, 8, mod, start);
+}
+
+// CHECK-LABEL: test7
+// CHECK: @llvm.hexagon.L2.loadrub.pcr
+unsigned char test7(int mod, void *start) {
+ unsigned char *base = start;
+ return __builtin_HEXAGON_L2_loadrub_pcr(&base, mod, start);
+}
+
+// CHECK-LABEL: test8
+// CHECK: @llvm.hexagon.L2.loadrb.pcr
+unsigned char test8(int mod, void *start) {
+ char *base = start;
+ return __builtin_HEXAGON_L2_loadrb_pcr(&base, mod, start);
+}
+
+// CHECK-LABEL: test9
+// CHECK: @llvm.hexagon.L2.loadruh.pcr
+unsigned short test9(int mod, void *start) {
+ unsigned short *base = start;
+ return __builtin_HEXAGON_L2_loadruh_pcr(&base, mod, start);
+}
+
+// CHECK-LABEL: test10
+// CHECK: @llvm.hexagon.L2.loadrh.pcr
+short test10(int mod, void *start) {
+ short *base = start;
+ return __builtin_HEXAGON_L2_loadrh_pcr(&base, mod, start);
+}
+
+// CHECK-LABEL: test11
+// CHECK: @llvm.hexagon.L2.loadri.pcr
+int test11(int mod, void *start) {
+ int *base = start;
+ return __builtin_HEXAGON_L2_loadri_pcr(&base, mod, start);
+}
+
+// CHECK-LABEL: test12
+// CHECK: @llvm.hexagon.L2.loadrd.pcr
+long long test12(int mod, void *start) {
+ long long *base = start;
+ return __builtin_HEXAGON_L2_loadrd_pcr(&base, mod, start);
+}
+
+// CHECK-LABEL: test13
+// CHECK: @llvm.hexagon.S2.storerb.pci
+void test13(int mod, void *start, char v) {
+ void *base = start;
+ __builtin_HEXAGON_S2_storerb_pci(&base, 4, mod, v, start);
+}
+
+// CHECK-LABEL: test14
+// CHECK: @llvm.hexagon.S2.storerh.pci
+void test14(int mod, void *start, short v) {
+ void *base = start;
+ __builtin_HEXAGON_S2_storerh_pci(&base, 4, mod, v, start);
+}
+
+// CHECK-LABEL: test15
+// CHECK: @llvm.hexagon.S2.storerf.pci
+void test15(int mod, void *start, short v) {
+ void *base = start;
+ __builtin_HEXAGON_S2_storerf_pci(&base, 4, mod, v, start);
+}
+
+// CHECK-LABEL: test16
+// CHECK: @llvm.hexagon.S2.storeri.pci
+void test16(int mod, void *start, int v) {
+ void *base = start;
+ __builtin_HEXAGON_S2_storeri_pci(&base, 4, mod, v, start);
+}
+
+// CHECK-LABEL: test17
+// CHECK: @llvm.hexagon.S2.storerd.pci
+void test17(int mod, void *start, long long v) {
+ void *base = start;
+ __builtin_HEXAGON_S2_storerd_pci(&base, 8, mod, v, start);
+}
+
+// CHECK-LABEL: test18
+// CHECK: @llvm.hexagon.S2.storerb.pcr
+void test18(int mod, void *start, char v) {
+ void *base = start;
+ __builtin_HEXAGON_S2_storerb_pcr(&base, mod, v, start);
+}
+
+// CHECK-LABEL: test19
+// CHECK: @llvm.hexagon.S2.storerh.pcr
+void test19(int mod, void *start, short v) {
+ void *base = start;
+ __builtin_HEXAGON_S2_storerh_pcr(&base, mod, v, start);
+}
+
+// CHECK-LABEL: test20
+// CHECK: @llvm.hexagon.S2.storerf.pcr
+void test20(int mod, void *start, short v) {
+ void *base = start;
+ __builtin_HEXAGON_S2_storerf_pcr(&base, mod, v, start);
+}
+
+// CHECK-LABEL: test21
+// CHECK: @llvm.hexagon.S2.storeri.pcr
+void test21(int mod, void *start, int v) {
+ void *base = start;
+ __builtin_HEXAGON_S2_storeri_pcr(&base, mod, v, start);
+}
+
+// CHECK-LABEL: test22
+// CHECK: @llvm.hexagon.S2.storerd.pcr
+void test22(int mod, void *start, long long v) {
+ void *base = start;
+ __builtin_HEXAGON_S2_storerd_pcr(&base, mod, v, start);
+}
diff --git a/test/CodeGen/builtins-hexagon.c b/test/CodeGen/builtins-hexagon.c
index 22835a23f292..b4c0642ef3c1 100644
--- a/test/CodeGen/builtins-hexagon.c
+++ b/test/CodeGen/builtins-hexagon.c
@@ -1,5 +1,5 @@
// REQUIRES: hexagon-registered-target
-// RUN: %clang_cc1 -triple hexagon-unknown-elf -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple hexagon-unknown-elf -target-cpu hexagonv65 -target-feature +hvxv65 -emit-llvm %s -o - | FileCheck %s
void test() {
int v64 __attribute__((__vector_size__(64)));
@@ -1722,1594 +1722,6 @@ void test() {
__builtin_HEXAGON_S6_vtrunehb_ppp(0, 0);
// CHECK: @llvm.hexagon.S6.vtrunohb.ppp
__builtin_HEXAGON_S6_vtrunohb_ppp(0, 0);
- // CHECK: @llvm.hexagon.V6.extractw
- __builtin_HEXAGON_V6_extractw(v64, 0);
- // CHECK: @llvm.hexagon.V6.extractw.128B
- __builtin_HEXAGON_V6_extractw_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.hi
- __builtin_HEXAGON_V6_hi(v128);
- // CHECK: @llvm.hexagon.V6.hi.128B
- __builtin_HEXAGON_V6_hi_128B(v256);
- // CHECK: @llvm.hexagon.V6.lo
- __builtin_HEXAGON_V6_lo(v128);
- // CHECK: @llvm.hexagon.V6.lo.128B
- __builtin_HEXAGON_V6_lo_128B(v256);
- // CHECK: @llvm.hexagon.V6.lvsplatb
- __builtin_HEXAGON_V6_lvsplatb(0);
- // CHECK: @llvm.hexagon.V6.lvsplatb.128B
- __builtin_HEXAGON_V6_lvsplatb_128B(0);
- // CHECK: @llvm.hexagon.V6.lvsplath
- __builtin_HEXAGON_V6_lvsplath(0);
- // CHECK: @llvm.hexagon.V6.lvsplath.128B
- __builtin_HEXAGON_V6_lvsplath_128B(0);
- // CHECK: @llvm.hexagon.V6.lvsplatw
- __builtin_HEXAGON_V6_lvsplatw(0);
- // CHECK: @llvm.hexagon.V6.lvsplatw.128B
- __builtin_HEXAGON_V6_lvsplatw_128B(0);
- // CHECK: @llvm.hexagon.V6.pred.and
- __builtin_HEXAGON_V6_pred_and(v64, v64);
- // CHECK: @llvm.hexagon.V6.pred.and.128B
- __builtin_HEXAGON_V6_pred_and_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.pred.and.n
- __builtin_HEXAGON_V6_pred_and_n(v64, v64);
- // CHECK: @llvm.hexagon.V6.pred.and.n.128B
- __builtin_HEXAGON_V6_pred_and_n_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.pred.not
- __builtin_HEXAGON_V6_pred_not(v64);
- // CHECK: @llvm.hexagon.V6.pred.not.128B
- __builtin_HEXAGON_V6_pred_not_128B(v128);
- // CHECK: @llvm.hexagon.V6.pred.or
- __builtin_HEXAGON_V6_pred_or(v64, v64);
- // CHECK: @llvm.hexagon.V6.pred.or.128B
- __builtin_HEXAGON_V6_pred_or_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.pred.or.n
- __builtin_HEXAGON_V6_pred_or_n(v64, v64);
- // CHECK: @llvm.hexagon.V6.pred.or.n.128B
- __builtin_HEXAGON_V6_pred_or_n_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.pred.scalar2
- __builtin_HEXAGON_V6_pred_scalar2(0);
- // CHECK: @llvm.hexagon.V6.pred.scalar2.128B
- __builtin_HEXAGON_V6_pred_scalar2_128B(0);
- // CHECK: @llvm.hexagon.V6.pred.scalar2v2
- __builtin_HEXAGON_V6_pred_scalar2v2(0);
- // CHECK: @llvm.hexagon.V6.pred.scalar2v2.128B
- __builtin_HEXAGON_V6_pred_scalar2v2_128B(0);
- // CHECK: @llvm.hexagon.V6.pred.xor
- __builtin_HEXAGON_V6_pred_xor(v64, v64);
- // CHECK: @llvm.hexagon.V6.pred.xor.128B
- __builtin_HEXAGON_V6_pred_xor_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.shuffeqh
- __builtin_HEXAGON_V6_shuffeqh(v64, v64);
- // CHECK: @llvm.hexagon.V6.shuffeqh.128B
- __builtin_HEXAGON_V6_shuffeqh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.shuffeqw
- __builtin_HEXAGON_V6_shuffeqw(v64, v64);
- // CHECK: @llvm.hexagon.V6.shuffeqw.128B
- __builtin_HEXAGON_V6_shuffeqw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vS32b.nqpred.ai
- __builtin_HEXAGON_V6_vS32b_nqpred_ai(v64, 0, v64);
- // CHECK: @llvm.hexagon.V6.vS32b.nqpred.ai.128B
- __builtin_HEXAGON_V6_vS32b_nqpred_ai_128B(v128, 0, v128);
- // CHECK: @llvm.hexagon.V6.vS32b.nt.nqpred.ai
- __builtin_HEXAGON_V6_vS32b_nt_nqpred_ai(v64, 0, v64);
- // CHECK: @llvm.hexagon.V6.vS32b.nt.nqpred.ai.128B
- __builtin_HEXAGON_V6_vS32b_nt_nqpred_ai_128B(v128, 0, v128);
- // CHECK: @llvm.hexagon.V6.vS32b.nt.qpred.ai
- __builtin_HEXAGON_V6_vS32b_nt_qpred_ai(v64, 0, v64);
- // CHECK: @llvm.hexagon.V6.vS32b.nt.qpred.ai.128B
- __builtin_HEXAGON_V6_vS32b_nt_qpred_ai_128B(v128, 0, v128);
- // CHECK: @llvm.hexagon.V6.vS32b.qpred.ai
- __builtin_HEXAGON_V6_vS32b_qpred_ai(v64, 0, v64);
- // CHECK: @llvm.hexagon.V6.vS32b.qpred.ai.128B
- __builtin_HEXAGON_V6_vS32b_qpred_ai_128B(v128, 0, v128);
- // CHECK: @llvm.hexagon.V6.vabsb
- __builtin_HEXAGON_V6_vabsb(v64);
- // CHECK: @llvm.hexagon.V6.vabsb.128B
- __builtin_HEXAGON_V6_vabsb_128B(v128);
- // CHECK: @llvm.hexagon.V6.vabsb.sat
- __builtin_HEXAGON_V6_vabsb_sat(v64);
- // CHECK: @llvm.hexagon.V6.vabsb.sat.128B
- __builtin_HEXAGON_V6_vabsb_sat_128B(v128);
- // CHECK: @llvm.hexagon.V6.vabsdiffh
- __builtin_HEXAGON_V6_vabsdiffh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vabsdiffh.128B
- __builtin_HEXAGON_V6_vabsdiffh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vabsdiffub
- __builtin_HEXAGON_V6_vabsdiffub(v64, v64);
- // CHECK: @llvm.hexagon.V6.vabsdiffub.128B
- __builtin_HEXAGON_V6_vabsdiffub_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vabsdiffuh
- __builtin_HEXAGON_V6_vabsdiffuh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vabsdiffuh.128B
- __builtin_HEXAGON_V6_vabsdiffuh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vabsdiffw
- __builtin_HEXAGON_V6_vabsdiffw(v64, v64);
- // CHECK: @llvm.hexagon.V6.vabsdiffw.128B
- __builtin_HEXAGON_V6_vabsdiffw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vabsh
- __builtin_HEXAGON_V6_vabsh(v64);
- // CHECK: @llvm.hexagon.V6.vabsh.128B
- __builtin_HEXAGON_V6_vabsh_128B(v128);
- // CHECK: @llvm.hexagon.V6.vabsh.sat
- __builtin_HEXAGON_V6_vabsh_sat(v64);
- // CHECK: @llvm.hexagon.V6.vabsh.sat.128B
- __builtin_HEXAGON_V6_vabsh_sat_128B(v128);
- // CHECK: @llvm.hexagon.V6.vabsw
- __builtin_HEXAGON_V6_vabsw(v64);
- // CHECK: @llvm.hexagon.V6.vabsw.128B
- __builtin_HEXAGON_V6_vabsw_128B(v128);
- // CHECK: @llvm.hexagon.V6.vabsw.sat
- __builtin_HEXAGON_V6_vabsw_sat(v64);
- // CHECK: @llvm.hexagon.V6.vabsw.sat.128B
- __builtin_HEXAGON_V6_vabsw_sat_128B(v128);
- // CHECK: @llvm.hexagon.V6.vaddb
- __builtin_HEXAGON_V6_vaddb(v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddb.128B
- __builtin_HEXAGON_V6_vaddb_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddb.dv
- __builtin_HEXAGON_V6_vaddb_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddb.dv.128B
- __builtin_HEXAGON_V6_vaddb_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vaddbnq
- __builtin_HEXAGON_V6_vaddbnq(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddbnq.128B
- __builtin_HEXAGON_V6_vaddbnq_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddbq
- __builtin_HEXAGON_V6_vaddbq(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddbq.128B
- __builtin_HEXAGON_V6_vaddbq_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddbsat
- __builtin_HEXAGON_V6_vaddbsat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddbsat.128B
- __builtin_HEXAGON_V6_vaddbsat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddbsat.dv
- __builtin_HEXAGON_V6_vaddbsat_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddbsat.dv.128B
- __builtin_HEXAGON_V6_vaddbsat_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vaddcarry
- __builtin_HEXAGON_V6_vaddcarry(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vaddcarry.128B
- __builtin_HEXAGON_V6_vaddcarry_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vaddclbh
- __builtin_HEXAGON_V6_vaddclbh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddclbh.128B
- __builtin_HEXAGON_V6_vaddclbh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddclbw
- __builtin_HEXAGON_V6_vaddclbw(v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddclbw.128B
- __builtin_HEXAGON_V6_vaddclbw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddh
- __builtin_HEXAGON_V6_vaddh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddh.128B
- __builtin_HEXAGON_V6_vaddh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddh.dv
- __builtin_HEXAGON_V6_vaddh_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddh.dv.128B
- __builtin_HEXAGON_V6_vaddh_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vaddhnq
- __builtin_HEXAGON_V6_vaddhnq(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddhnq.128B
- __builtin_HEXAGON_V6_vaddhnq_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddhq
- __builtin_HEXAGON_V6_vaddhq(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddhq.128B
- __builtin_HEXAGON_V6_vaddhq_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddhsat
- __builtin_HEXAGON_V6_vaddhsat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddhsat.128B
- __builtin_HEXAGON_V6_vaddhsat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddhsat.dv
- __builtin_HEXAGON_V6_vaddhsat_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddhsat.dv.128B
- __builtin_HEXAGON_V6_vaddhsat_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vaddhw
- __builtin_HEXAGON_V6_vaddhw(v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddhw.128B
- __builtin_HEXAGON_V6_vaddhw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddhw.acc
- __builtin_HEXAGON_V6_vaddhw_acc(v128, v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddhw.acc.128B
- __builtin_HEXAGON_V6_vaddhw_acc_128B(v256, v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddubh
- __builtin_HEXAGON_V6_vaddubh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddubh.128B
- __builtin_HEXAGON_V6_vaddubh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddubh.acc
- __builtin_HEXAGON_V6_vaddubh_acc(v128, v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddubh.acc.128B
- __builtin_HEXAGON_V6_vaddubh_acc_128B(v256, v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddubsat
- __builtin_HEXAGON_V6_vaddubsat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddubsat.128B
- __builtin_HEXAGON_V6_vaddubsat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddubsat.dv
- __builtin_HEXAGON_V6_vaddubsat_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddubsat.dv.128B
- __builtin_HEXAGON_V6_vaddubsat_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vaddububb.sat
- __builtin_HEXAGON_V6_vaddububb_sat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddububb.sat.128B
- __builtin_HEXAGON_V6_vaddububb_sat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vadduhsat
- __builtin_HEXAGON_V6_vadduhsat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vadduhsat.128B
- __builtin_HEXAGON_V6_vadduhsat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vadduhsat.dv
- __builtin_HEXAGON_V6_vadduhsat_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vadduhsat.dv.128B
- __builtin_HEXAGON_V6_vadduhsat_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vadduhw
- __builtin_HEXAGON_V6_vadduhw(v64, v64);
- // CHECK: @llvm.hexagon.V6.vadduhw.128B
- __builtin_HEXAGON_V6_vadduhw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vadduhw.acc
- __builtin_HEXAGON_V6_vadduhw_acc(v128, v64, v64);
- // CHECK: @llvm.hexagon.V6.vadduhw.acc.128B
- __builtin_HEXAGON_V6_vadduhw_acc_128B(v256, v128, v128);
- // CHECK: @llvm.hexagon.V6.vadduwsat
- __builtin_HEXAGON_V6_vadduwsat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vadduwsat.128B
- __builtin_HEXAGON_V6_vadduwsat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vadduwsat.dv
- __builtin_HEXAGON_V6_vadduwsat_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vadduwsat.dv.128B
- __builtin_HEXAGON_V6_vadduwsat_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vaddw
- __builtin_HEXAGON_V6_vaddw(v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddw.128B
- __builtin_HEXAGON_V6_vaddw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddw.dv
- __builtin_HEXAGON_V6_vaddw_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddw.dv.128B
- __builtin_HEXAGON_V6_vaddw_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vaddwnq
- __builtin_HEXAGON_V6_vaddwnq(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddwnq.128B
- __builtin_HEXAGON_V6_vaddwnq_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddwq
- __builtin_HEXAGON_V6_vaddwq(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddwq.128B
- __builtin_HEXAGON_V6_vaddwq_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddwsat
- __builtin_HEXAGON_V6_vaddwsat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vaddwsat.128B
- __builtin_HEXAGON_V6_vaddwsat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddwsat.dv
- __builtin_HEXAGON_V6_vaddwsat_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaddwsat.dv.128B
- __builtin_HEXAGON_V6_vaddwsat_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.valignb
- __builtin_HEXAGON_V6_valignb(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.valignb.128B
- __builtin_HEXAGON_V6_valignb_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.valignbi
- __builtin_HEXAGON_V6_valignbi(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.valignbi.128B
- __builtin_HEXAGON_V6_valignbi_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vand
- __builtin_HEXAGON_V6_vand(v64, v64);
- // CHECK: @llvm.hexagon.V6.vand.128B
- __builtin_HEXAGON_V6_vand_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vandnqrt
- __builtin_HEXAGON_V6_vandnqrt(v64, 0);
- // CHECK: @llvm.hexagon.V6.vandnqrt.128B
- __builtin_HEXAGON_V6_vandnqrt_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vandnqrt.acc
- __builtin_HEXAGON_V6_vandnqrt_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vandnqrt.acc.128B
- __builtin_HEXAGON_V6_vandnqrt_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vandqrt
- __builtin_HEXAGON_V6_vandqrt(v64, 0);
- // CHECK: @llvm.hexagon.V6.vandqrt.128B
- __builtin_HEXAGON_V6_vandqrt_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vandqrt.acc
- __builtin_HEXAGON_V6_vandqrt_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vandqrt.acc.128B
- __builtin_HEXAGON_V6_vandqrt_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vandvnqv
- __builtin_HEXAGON_V6_vandvnqv(v64, v64);
- // CHECK: @llvm.hexagon.V6.vandvnqv.128B
- __builtin_HEXAGON_V6_vandvnqv_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vandvqv
- __builtin_HEXAGON_V6_vandvqv(v64, v64);
- // CHECK: @llvm.hexagon.V6.vandvqv.128B
- __builtin_HEXAGON_V6_vandvqv_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vandvrt
- __builtin_HEXAGON_V6_vandvrt(v64, 0);
- // CHECK: @llvm.hexagon.V6.vandvrt.128B
- __builtin_HEXAGON_V6_vandvrt_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vandvrt.acc
- __builtin_HEXAGON_V6_vandvrt_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vandvrt.acc.128B
- __builtin_HEXAGON_V6_vandvrt_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vaslh
- __builtin_HEXAGON_V6_vaslh(v64, 0);
- // CHECK: @llvm.hexagon.V6.vaslh.128B
- __builtin_HEXAGON_V6_vaslh_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vaslh.acc
- __builtin_HEXAGON_V6_vaslh_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vaslh.acc.128B
- __builtin_HEXAGON_V6_vaslh_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vaslhv
- __builtin_HEXAGON_V6_vaslhv(v64, v64);
- // CHECK: @llvm.hexagon.V6.vaslhv.128B
- __builtin_HEXAGON_V6_vaslhv_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vaslw
- __builtin_HEXAGON_V6_vaslw(v64, 0);
- // CHECK: @llvm.hexagon.V6.vaslw.128B
- __builtin_HEXAGON_V6_vaslw_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vaslw.acc
- __builtin_HEXAGON_V6_vaslw_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vaslw.acc.128B
- __builtin_HEXAGON_V6_vaslw_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vaslwv
- __builtin_HEXAGON_V6_vaslwv(v64, v64);
- // CHECK: @llvm.hexagon.V6.vaslwv.128B
- __builtin_HEXAGON_V6_vaslwv_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vasrh
- __builtin_HEXAGON_V6_vasrh(v64, 0);
- // CHECK: @llvm.hexagon.V6.vasrh.128B
- __builtin_HEXAGON_V6_vasrh_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vasrh.acc
- __builtin_HEXAGON_V6_vasrh_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vasrh.acc.128B
- __builtin_HEXAGON_V6_vasrh_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vasrhbrndsat
- __builtin_HEXAGON_V6_vasrhbrndsat(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vasrhbrndsat.128B
- __builtin_HEXAGON_V6_vasrhbrndsat_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vasrhbsat
- __builtin_HEXAGON_V6_vasrhbsat(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vasrhbsat.128B
- __builtin_HEXAGON_V6_vasrhbsat_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vasrhubrndsat
- __builtin_HEXAGON_V6_vasrhubrndsat(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vasrhubrndsat.128B
- __builtin_HEXAGON_V6_vasrhubrndsat_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vasrhubsat
- __builtin_HEXAGON_V6_vasrhubsat(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vasrhubsat.128B
- __builtin_HEXAGON_V6_vasrhubsat_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vasrhv
- __builtin_HEXAGON_V6_vasrhv(v64, v64);
- // CHECK: @llvm.hexagon.V6.vasrhv.128B
- __builtin_HEXAGON_V6_vasrhv_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vasruhubrndsat
- __builtin_HEXAGON_V6_vasruhubrndsat(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vasruhubrndsat.128B
- __builtin_HEXAGON_V6_vasruhubrndsat_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vasruhubsat
- __builtin_HEXAGON_V6_vasruhubsat(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vasruhubsat.128B
- __builtin_HEXAGON_V6_vasruhubsat_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vasruwuhrndsat
- __builtin_HEXAGON_V6_vasruwuhrndsat(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vasruwuhrndsat.128B
- __builtin_HEXAGON_V6_vasruwuhrndsat_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vasruwuhsat
- __builtin_HEXAGON_V6_vasruwuhsat(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vasruwuhsat.128B
- __builtin_HEXAGON_V6_vasruwuhsat_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vasrw
- __builtin_HEXAGON_V6_vasrw(v64, 0);
- // CHECK: @llvm.hexagon.V6.vasrw.128B
- __builtin_HEXAGON_V6_vasrw_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vasrw.acc
- __builtin_HEXAGON_V6_vasrw_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vasrw.acc.128B
- __builtin_HEXAGON_V6_vasrw_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vasrwh
- __builtin_HEXAGON_V6_vasrwh(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vasrwh.128B
- __builtin_HEXAGON_V6_vasrwh_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vasrwhrndsat
- __builtin_HEXAGON_V6_vasrwhrndsat(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vasrwhrndsat.128B
- __builtin_HEXAGON_V6_vasrwhrndsat_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vasrwhsat
- __builtin_HEXAGON_V6_vasrwhsat(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vasrwhsat.128B
- __builtin_HEXAGON_V6_vasrwhsat_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vasrwuhrndsat
- __builtin_HEXAGON_V6_vasrwuhrndsat(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vasrwuhrndsat.128B
- __builtin_HEXAGON_V6_vasrwuhrndsat_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vasrwuhsat
- __builtin_HEXAGON_V6_vasrwuhsat(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vasrwuhsat.128B
- __builtin_HEXAGON_V6_vasrwuhsat_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vasrwv
- __builtin_HEXAGON_V6_vasrwv(v64, v64);
- // CHECK: @llvm.hexagon.V6.vasrwv.128B
- __builtin_HEXAGON_V6_vasrwv_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vassign
- __builtin_HEXAGON_V6_vassign(v64);
- // CHECK: @llvm.hexagon.V6.vassign.128B
- __builtin_HEXAGON_V6_vassign_128B(v128);
- // CHECK: @llvm.hexagon.V6.vassignp
- __builtin_HEXAGON_V6_vassignp(v128);
- // CHECK: @llvm.hexagon.V6.vassignp.128B
- __builtin_HEXAGON_V6_vassignp_128B(v256);
- // CHECK: @llvm.hexagon.V6.vavgb
- __builtin_HEXAGON_V6_vavgb(v64, v64);
- // CHECK: @llvm.hexagon.V6.vavgb.128B
- __builtin_HEXAGON_V6_vavgb_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vavgbrnd
- __builtin_HEXAGON_V6_vavgbrnd(v64, v64);
- // CHECK: @llvm.hexagon.V6.vavgbrnd.128B
- __builtin_HEXAGON_V6_vavgbrnd_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vavgh
- __builtin_HEXAGON_V6_vavgh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vavgh.128B
- __builtin_HEXAGON_V6_vavgh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vavghrnd
- __builtin_HEXAGON_V6_vavghrnd(v64, v64);
- // CHECK: @llvm.hexagon.V6.vavghrnd.128B
- __builtin_HEXAGON_V6_vavghrnd_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vavgub
- __builtin_HEXAGON_V6_vavgub(v64, v64);
- // CHECK: @llvm.hexagon.V6.vavgub.128B
- __builtin_HEXAGON_V6_vavgub_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vavgubrnd
- __builtin_HEXAGON_V6_vavgubrnd(v64, v64);
- // CHECK: @llvm.hexagon.V6.vavgubrnd.128B
- __builtin_HEXAGON_V6_vavgubrnd_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vavguh
- __builtin_HEXAGON_V6_vavguh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vavguh.128B
- __builtin_HEXAGON_V6_vavguh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vavguhrnd
- __builtin_HEXAGON_V6_vavguhrnd(v64, v64);
- // CHECK: @llvm.hexagon.V6.vavguhrnd.128B
- __builtin_HEXAGON_V6_vavguhrnd_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vavguw
- __builtin_HEXAGON_V6_vavguw(v64, v64);
- // CHECK: @llvm.hexagon.V6.vavguw.128B
- __builtin_HEXAGON_V6_vavguw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vavguwrnd
- __builtin_HEXAGON_V6_vavguwrnd(v64, v64);
- // CHECK: @llvm.hexagon.V6.vavguwrnd.128B
- __builtin_HEXAGON_V6_vavguwrnd_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vavgw
- __builtin_HEXAGON_V6_vavgw(v64, v64);
- // CHECK: @llvm.hexagon.V6.vavgw.128B
- __builtin_HEXAGON_V6_vavgw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vavgwrnd
- __builtin_HEXAGON_V6_vavgwrnd(v64, v64);
- // CHECK: @llvm.hexagon.V6.vavgwrnd.128B
- __builtin_HEXAGON_V6_vavgwrnd_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vcl0h
- __builtin_HEXAGON_V6_vcl0h(v64);
- // CHECK: @llvm.hexagon.V6.vcl0h.128B
- __builtin_HEXAGON_V6_vcl0h_128B(v128);
- // CHECK: @llvm.hexagon.V6.vcl0w
- __builtin_HEXAGON_V6_vcl0w(v64);
- // CHECK: @llvm.hexagon.V6.vcl0w.128B
- __builtin_HEXAGON_V6_vcl0w_128B(v128);
- // CHECK: @llvm.hexagon.V6.vcombine
- __builtin_HEXAGON_V6_vcombine(v64, v64);
- // CHECK: @llvm.hexagon.V6.vcombine.128B
- __builtin_HEXAGON_V6_vcombine_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vd0
- __builtin_HEXAGON_V6_vd0();
- // CHECK: @llvm.hexagon.V6.vd0.128B
- __builtin_HEXAGON_V6_vd0_128B();
- // CHECK: @llvm.hexagon.V6.vdd0
- __builtin_HEXAGON_V6_vdd0();
- // CHECK: @llvm.hexagon.V6.vdd0.128B
- __builtin_HEXAGON_V6_vdd0_128B();
- // CHECK: @llvm.hexagon.V6.vdealb
- __builtin_HEXAGON_V6_vdealb(v64);
- // CHECK: @llvm.hexagon.V6.vdealb.128B
- __builtin_HEXAGON_V6_vdealb_128B(v128);
- // CHECK: @llvm.hexagon.V6.vdealb4w
- __builtin_HEXAGON_V6_vdealb4w(v64, v64);
- // CHECK: @llvm.hexagon.V6.vdealb4w.128B
- __builtin_HEXAGON_V6_vdealb4w_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vdealh
- __builtin_HEXAGON_V6_vdealh(v64);
- // CHECK: @llvm.hexagon.V6.vdealh.128B
- __builtin_HEXAGON_V6_vdealh_128B(v128);
- // CHECK: @llvm.hexagon.V6.vdealvdd
- __builtin_HEXAGON_V6_vdealvdd(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vdealvdd.128B
- __builtin_HEXAGON_V6_vdealvdd_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vdelta
- __builtin_HEXAGON_V6_vdelta(v64, v64);
- // CHECK: @llvm.hexagon.V6.vdelta.128B
- __builtin_HEXAGON_V6_vdelta_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vdmpybus
- __builtin_HEXAGON_V6_vdmpybus(v64, 0);
- // CHECK: @llvm.hexagon.V6.vdmpybus.128B
- __builtin_HEXAGON_V6_vdmpybus_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vdmpybus.acc
- __builtin_HEXAGON_V6_vdmpybus_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vdmpybus.acc.128B
- __builtin_HEXAGON_V6_vdmpybus_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vdmpybus.dv
- __builtin_HEXAGON_V6_vdmpybus_dv(v128, 0);
- // CHECK: @llvm.hexagon.V6.vdmpybus.dv.128B
- __builtin_HEXAGON_V6_vdmpybus_dv_128B(v256, 0);
- // CHECK: @llvm.hexagon.V6.vdmpybus.dv.acc
- __builtin_HEXAGON_V6_vdmpybus_dv_acc(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vdmpybus.dv.acc.128B
- __builtin_HEXAGON_V6_vdmpybus_dv_acc_128B(v256, v256, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhb
- __builtin_HEXAGON_V6_vdmpyhb(v64, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhb.128B
- __builtin_HEXAGON_V6_vdmpyhb_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhb.acc
- __builtin_HEXAGON_V6_vdmpyhb_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhb.acc.128B
- __builtin_HEXAGON_V6_vdmpyhb_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhb.dv
- __builtin_HEXAGON_V6_vdmpyhb_dv(v128, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhb.dv.128B
- __builtin_HEXAGON_V6_vdmpyhb_dv_128B(v256, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhb.dv.acc
- __builtin_HEXAGON_V6_vdmpyhb_dv_acc(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhb.dv.acc.128B
- __builtin_HEXAGON_V6_vdmpyhb_dv_acc_128B(v256, v256, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhisat
- __builtin_HEXAGON_V6_vdmpyhisat(v128, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhisat.128B
- __builtin_HEXAGON_V6_vdmpyhisat_128B(v256, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhisat.acc
- __builtin_HEXAGON_V6_vdmpyhisat_acc(v64, v128, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhisat.acc.128B
- __builtin_HEXAGON_V6_vdmpyhisat_acc_128B(v128, v256, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhsat
- __builtin_HEXAGON_V6_vdmpyhsat(v64, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhsat.128B
- __builtin_HEXAGON_V6_vdmpyhsat_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhsat.acc
- __builtin_HEXAGON_V6_vdmpyhsat_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhsat.acc.128B
- __builtin_HEXAGON_V6_vdmpyhsat_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhsuisat
- __builtin_HEXAGON_V6_vdmpyhsuisat(v128, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhsuisat.128B
- __builtin_HEXAGON_V6_vdmpyhsuisat_128B(v256, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhsuisat.acc
- __builtin_HEXAGON_V6_vdmpyhsuisat_acc(v64, v128, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhsuisat.acc.128B
- __builtin_HEXAGON_V6_vdmpyhsuisat_acc_128B(v128, v256, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhsusat
- __builtin_HEXAGON_V6_vdmpyhsusat(v64, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhsusat.128B
- __builtin_HEXAGON_V6_vdmpyhsusat_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhsusat.acc
- __builtin_HEXAGON_V6_vdmpyhsusat_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhsusat.acc.128B
- __builtin_HEXAGON_V6_vdmpyhsusat_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vdmpyhvsat
- __builtin_HEXAGON_V6_vdmpyhvsat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vdmpyhvsat.128B
- __builtin_HEXAGON_V6_vdmpyhvsat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vdmpyhvsat.acc
- __builtin_HEXAGON_V6_vdmpyhvsat_acc(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vdmpyhvsat.acc.128B
- __builtin_HEXAGON_V6_vdmpyhvsat_acc_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vdsaduh
- __builtin_HEXAGON_V6_vdsaduh(v128, 0);
- // CHECK: @llvm.hexagon.V6.vdsaduh.128B
- __builtin_HEXAGON_V6_vdsaduh_128B(v256, 0);
- // CHECK: @llvm.hexagon.V6.vdsaduh.acc
- __builtin_HEXAGON_V6_vdsaduh_acc(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vdsaduh.acc.128B
- __builtin_HEXAGON_V6_vdsaduh_acc_128B(v256, v256, 0);
- // CHECK: @llvm.hexagon.V6.veqb
- __builtin_HEXAGON_V6_veqb(v64, v64);
- // CHECK: @llvm.hexagon.V6.veqb.128B
- __builtin_HEXAGON_V6_veqb_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.veqb.and
- __builtin_HEXAGON_V6_veqb_and(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.veqb.and.128B
- __builtin_HEXAGON_V6_veqb_and_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.veqb.or
- __builtin_HEXAGON_V6_veqb_or(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.veqb.or.128B
- __builtin_HEXAGON_V6_veqb_or_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.veqb.xor
- __builtin_HEXAGON_V6_veqb_xor(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.veqb.xor.128B
- __builtin_HEXAGON_V6_veqb_xor_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.veqh
- __builtin_HEXAGON_V6_veqh(v64, v64);
- // CHECK: @llvm.hexagon.V6.veqh.128B
- __builtin_HEXAGON_V6_veqh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.veqh.and
- __builtin_HEXAGON_V6_veqh_and(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.veqh.and.128B
- __builtin_HEXAGON_V6_veqh_and_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.veqh.or
- __builtin_HEXAGON_V6_veqh_or(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.veqh.or.128B
- __builtin_HEXAGON_V6_veqh_or_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.veqh.xor
- __builtin_HEXAGON_V6_veqh_xor(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.veqh.xor.128B
- __builtin_HEXAGON_V6_veqh_xor_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.veqw
- __builtin_HEXAGON_V6_veqw(v64, v64);
- // CHECK: @llvm.hexagon.V6.veqw.128B
- __builtin_HEXAGON_V6_veqw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.veqw.and
- __builtin_HEXAGON_V6_veqw_and(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.veqw.and.128B
- __builtin_HEXAGON_V6_veqw_and_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.veqw.or
- __builtin_HEXAGON_V6_veqw_or(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.veqw.or.128B
- __builtin_HEXAGON_V6_veqw_or_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.veqw.xor
- __builtin_HEXAGON_V6_veqw_xor(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.veqw.xor.128B
- __builtin_HEXAGON_V6_veqw_xor_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgathermh
- __builtin_HEXAGON_V6_vgathermh(0, 0, 0, v64);
- // CHECK: @llvm.hexagon.V6.vgathermh.128B
- __builtin_HEXAGON_V6_vgathermh_128B(0, 0, 0, v128);
- // CHECK: @llvm.hexagon.V6.vgathermhq
- __builtin_HEXAGON_V6_vgathermhq(0, v64, 0, 0, v64);
- // CHECK: @llvm.hexagon.V6.vgathermhq.128B
- __builtin_HEXAGON_V6_vgathermhq_128B(0, v128, 0, 0, v128);
- // CHECK: @llvm.hexagon.V6.vgathermhw
- __builtin_HEXAGON_V6_vgathermhw(0, 0, 0, v128);
- // CHECK: @llvm.hexagon.V6.vgathermhw.128B
- __builtin_HEXAGON_V6_vgathermhw_128B(0, 0, 0, v256);
- // CHECK: @llvm.hexagon.V6.vgathermhwq
- __builtin_HEXAGON_V6_vgathermhwq(0, v64, 0, 0, v128);
- // CHECK: @llvm.hexagon.V6.vgathermhwq.128B
- __builtin_HEXAGON_V6_vgathermhwq_128B(0, v128, 0, 0, v256);
- // CHECK: @llvm.hexagon.V6.vgathermw
- __builtin_HEXAGON_V6_vgathermw(0, 0, 0, v64);
- // CHECK: @llvm.hexagon.V6.vgathermw.128B
- __builtin_HEXAGON_V6_vgathermw_128B(0, 0, 0, v128);
- // CHECK: @llvm.hexagon.V6.vgathermwq
- __builtin_HEXAGON_V6_vgathermwq(0, v64, 0, 0, v64);
- // CHECK: @llvm.hexagon.V6.vgathermwq.128B
- __builtin_HEXAGON_V6_vgathermwq_128B(0, v128, 0, 0, v128);
- // CHECK: @llvm.hexagon.V6.vgtb
- __builtin_HEXAGON_V6_vgtb(v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtb.128B
- __builtin_HEXAGON_V6_vgtb_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtb.and
- __builtin_HEXAGON_V6_vgtb_and(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtb.and.128B
- __builtin_HEXAGON_V6_vgtb_and_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtb.or
- __builtin_HEXAGON_V6_vgtb_or(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtb.or.128B
- __builtin_HEXAGON_V6_vgtb_or_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtb.xor
- __builtin_HEXAGON_V6_vgtb_xor(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtb.xor.128B
- __builtin_HEXAGON_V6_vgtb_xor_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgth
- __builtin_HEXAGON_V6_vgth(v64, v64);
- // CHECK: @llvm.hexagon.V6.vgth.128B
- __builtin_HEXAGON_V6_vgth_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vgth.and
- __builtin_HEXAGON_V6_vgth_and(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgth.and.128B
- __builtin_HEXAGON_V6_vgth_and_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgth.or
- __builtin_HEXAGON_V6_vgth_or(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgth.or.128B
- __builtin_HEXAGON_V6_vgth_or_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgth.xor
- __builtin_HEXAGON_V6_vgth_xor(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgth.xor.128B
- __builtin_HEXAGON_V6_vgth_xor_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtub
- __builtin_HEXAGON_V6_vgtub(v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtub.128B
- __builtin_HEXAGON_V6_vgtub_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtub.and
- __builtin_HEXAGON_V6_vgtub_and(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtub.and.128B
- __builtin_HEXAGON_V6_vgtub_and_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtub.or
- __builtin_HEXAGON_V6_vgtub_or(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtub.or.128B
- __builtin_HEXAGON_V6_vgtub_or_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtub.xor
- __builtin_HEXAGON_V6_vgtub_xor(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtub.xor.128B
- __builtin_HEXAGON_V6_vgtub_xor_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtuh
- __builtin_HEXAGON_V6_vgtuh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtuh.128B
- __builtin_HEXAGON_V6_vgtuh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtuh.and
- __builtin_HEXAGON_V6_vgtuh_and(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtuh.and.128B
- __builtin_HEXAGON_V6_vgtuh_and_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtuh.or
- __builtin_HEXAGON_V6_vgtuh_or(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtuh.or.128B
- __builtin_HEXAGON_V6_vgtuh_or_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtuh.xor
- __builtin_HEXAGON_V6_vgtuh_xor(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtuh.xor.128B
- __builtin_HEXAGON_V6_vgtuh_xor_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtuw
- __builtin_HEXAGON_V6_vgtuw(v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtuw.128B
- __builtin_HEXAGON_V6_vgtuw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtuw.and
- __builtin_HEXAGON_V6_vgtuw_and(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtuw.and.128B
- __builtin_HEXAGON_V6_vgtuw_and_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtuw.or
- __builtin_HEXAGON_V6_vgtuw_or(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtuw.or.128B
- __builtin_HEXAGON_V6_vgtuw_or_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtuw.xor
- __builtin_HEXAGON_V6_vgtuw_xor(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtuw.xor.128B
- __builtin_HEXAGON_V6_vgtuw_xor_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtw
- __builtin_HEXAGON_V6_vgtw(v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtw.128B
- __builtin_HEXAGON_V6_vgtw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtw.and
- __builtin_HEXAGON_V6_vgtw_and(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtw.and.128B
- __builtin_HEXAGON_V6_vgtw_and_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtw.or
- __builtin_HEXAGON_V6_vgtw_or(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtw.or.128B
- __builtin_HEXAGON_V6_vgtw_or_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vgtw.xor
- __builtin_HEXAGON_V6_vgtw_xor(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vgtw.xor.128B
- __builtin_HEXAGON_V6_vgtw_xor_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vinsertwr
- __builtin_HEXAGON_V6_vinsertwr(v64, 0);
- // CHECK: @llvm.hexagon.V6.vinsertwr.128B
- __builtin_HEXAGON_V6_vinsertwr_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vlalignb
- __builtin_HEXAGON_V6_vlalignb(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vlalignb.128B
- __builtin_HEXAGON_V6_vlalignb_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vlalignbi
- __builtin_HEXAGON_V6_vlalignbi(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vlalignbi.128B
- __builtin_HEXAGON_V6_vlalignbi_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vlsrb
- __builtin_HEXAGON_V6_vlsrb(v64, 0);
- // CHECK: @llvm.hexagon.V6.vlsrb.128B
- __builtin_HEXAGON_V6_vlsrb_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vlsrh
- __builtin_HEXAGON_V6_vlsrh(v64, 0);
- // CHECK: @llvm.hexagon.V6.vlsrh.128B
- __builtin_HEXAGON_V6_vlsrh_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vlsrhv
- __builtin_HEXAGON_V6_vlsrhv(v64, v64);
- // CHECK: @llvm.hexagon.V6.vlsrhv.128B
- __builtin_HEXAGON_V6_vlsrhv_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vlsrw
- __builtin_HEXAGON_V6_vlsrw(v64, 0);
- // CHECK: @llvm.hexagon.V6.vlsrw.128B
- __builtin_HEXAGON_V6_vlsrw_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vlsrwv
- __builtin_HEXAGON_V6_vlsrwv(v64, v64);
- // CHECK: @llvm.hexagon.V6.vlsrwv.128B
- __builtin_HEXAGON_V6_vlsrwv_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vlut4
- __builtin_HEXAGON_V6_vlut4(v64, 0);
- // CHECK: @llvm.hexagon.V6.vlut4.128B
- __builtin_HEXAGON_V6_vlut4_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vlutvvb
- __builtin_HEXAGON_V6_vlutvvb(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vlutvvb.128B
- __builtin_HEXAGON_V6_vlutvvb_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vlutvvb.nm
- __builtin_HEXAGON_V6_vlutvvb_nm(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vlutvvb.nm.128B
- __builtin_HEXAGON_V6_vlutvvb_nm_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vlutvvb.oracc
- __builtin_HEXAGON_V6_vlutvvb_oracc(v64, v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vlutvvb.oracc.128B
- __builtin_HEXAGON_V6_vlutvvb_oracc_128B(v128, v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vlutvvb.oracci
- __builtin_HEXAGON_V6_vlutvvb_oracci(v64, v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vlutvvb.oracci.128B
- __builtin_HEXAGON_V6_vlutvvb_oracci_128B(v128, v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vlutvvbi
- __builtin_HEXAGON_V6_vlutvvbi(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vlutvvbi.128B
- __builtin_HEXAGON_V6_vlutvvbi_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vlutvwh
- __builtin_HEXAGON_V6_vlutvwh(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vlutvwh.128B
- __builtin_HEXAGON_V6_vlutvwh_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vlutvwh.nm
- __builtin_HEXAGON_V6_vlutvwh_nm(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vlutvwh.nm.128B
- __builtin_HEXAGON_V6_vlutvwh_nm_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vlutvwh.oracc
- __builtin_HEXAGON_V6_vlutvwh_oracc(v128, v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vlutvwh.oracc.128B
- __builtin_HEXAGON_V6_vlutvwh_oracc_128B(v256, v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vlutvwh.oracci
- __builtin_HEXAGON_V6_vlutvwh_oracci(v128, v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vlutvwh.oracci.128B
- __builtin_HEXAGON_V6_vlutvwh_oracci_128B(v256, v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vlutvwhi
- __builtin_HEXAGON_V6_vlutvwhi(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vlutvwhi.128B
- __builtin_HEXAGON_V6_vlutvwhi_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmaskedstorenq
- __builtin_HEXAGON_V6_vmaskedstorenq(v64, 0, v64);
- // CHECK: @llvm.hexagon.V6.vmaskedstorenq.128B
- __builtin_HEXAGON_V6_vmaskedstorenq_128B(v128, 0, v128);
- // CHECK: @llvm.hexagon.V6.vmaskedstorentnq
- __builtin_HEXAGON_V6_vmaskedstorentnq(v64, 0, v64);
- // CHECK: @llvm.hexagon.V6.vmaskedstorentnq.128B
- __builtin_HEXAGON_V6_vmaskedstorentnq_128B(v128, 0, v128);
- // CHECK: @llvm.hexagon.V6.vmaskedstorentq
- __builtin_HEXAGON_V6_vmaskedstorentq(v64, 0, v64);
- // CHECK: @llvm.hexagon.V6.vmaskedstorentq.128B
- __builtin_HEXAGON_V6_vmaskedstorentq_128B(v128, 0, v128);
- // CHECK: @llvm.hexagon.V6.vmaskedstoreq
- __builtin_HEXAGON_V6_vmaskedstoreq(v64, 0, v64);
- // CHECK: @llvm.hexagon.V6.vmaskedstoreq.128B
- __builtin_HEXAGON_V6_vmaskedstoreq_128B(v128, 0, v128);
- // CHECK: @llvm.hexagon.V6.vmaxb
- __builtin_HEXAGON_V6_vmaxb(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmaxb.128B
- __builtin_HEXAGON_V6_vmaxb_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmaxh
- __builtin_HEXAGON_V6_vmaxh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmaxh.128B
- __builtin_HEXAGON_V6_vmaxh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmaxub
- __builtin_HEXAGON_V6_vmaxub(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmaxub.128B
- __builtin_HEXAGON_V6_vmaxub_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmaxuh
- __builtin_HEXAGON_V6_vmaxuh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmaxuh.128B
- __builtin_HEXAGON_V6_vmaxuh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmaxw
- __builtin_HEXAGON_V6_vmaxw(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmaxw.128B
- __builtin_HEXAGON_V6_vmaxw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vminb
- __builtin_HEXAGON_V6_vminb(v64, v64);
- // CHECK: @llvm.hexagon.V6.vminb.128B
- __builtin_HEXAGON_V6_vminb_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vminh
- __builtin_HEXAGON_V6_vminh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vminh.128B
- __builtin_HEXAGON_V6_vminh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vminub
- __builtin_HEXAGON_V6_vminub(v64, v64);
- // CHECK: @llvm.hexagon.V6.vminub.128B
- __builtin_HEXAGON_V6_vminub_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vminuh
- __builtin_HEXAGON_V6_vminuh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vminuh.128B
- __builtin_HEXAGON_V6_vminuh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vminw
- __builtin_HEXAGON_V6_vminw(v64, v64);
- // CHECK: @llvm.hexagon.V6.vminw.128B
- __builtin_HEXAGON_V6_vminw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpabus
- __builtin_HEXAGON_V6_vmpabus(v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpabus.128B
- __builtin_HEXAGON_V6_vmpabus_128B(v256, 0);
- // CHECK: @llvm.hexagon.V6.vmpabus.acc
- __builtin_HEXAGON_V6_vmpabus_acc(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpabus.acc.128B
- __builtin_HEXAGON_V6_vmpabus_acc_128B(v256, v256, 0);
- // CHECK: @llvm.hexagon.V6.vmpabusv
- __builtin_HEXAGON_V6_vmpabusv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpabusv.128B
- __builtin_HEXAGON_V6_vmpabusv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vmpabuu
- __builtin_HEXAGON_V6_vmpabuu(v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpabuu.128B
- __builtin_HEXAGON_V6_vmpabuu_128B(v256, 0);
- // CHECK: @llvm.hexagon.V6.vmpabuu.acc
- __builtin_HEXAGON_V6_vmpabuu_acc(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpabuu.acc.128B
- __builtin_HEXAGON_V6_vmpabuu_acc_128B(v256, v256, 0);
- // CHECK: @llvm.hexagon.V6.vmpabuuv
- __builtin_HEXAGON_V6_vmpabuuv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpabuuv.128B
- __builtin_HEXAGON_V6_vmpabuuv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vmpahb
- __builtin_HEXAGON_V6_vmpahb(v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpahb.128B
- __builtin_HEXAGON_V6_vmpahb_128B(v256, 0);
- // CHECK: @llvm.hexagon.V6.vmpahb.acc
- __builtin_HEXAGON_V6_vmpahb_acc(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpahb.acc.128B
- __builtin_HEXAGON_V6_vmpahb_acc_128B(v256, v256, 0);
- // CHECK: @llvm.hexagon.V6.vmpahhsat
- __builtin_HEXAGON_V6_vmpahhsat(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpahhsat.128B
- __builtin_HEXAGON_V6_vmpahhsat_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpauhb
- __builtin_HEXAGON_V6_vmpauhb(v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpauhb.128B
- __builtin_HEXAGON_V6_vmpauhb_128B(v256, 0);
- // CHECK: @llvm.hexagon.V6.vmpauhb.acc
- __builtin_HEXAGON_V6_vmpauhb_acc(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpauhb.acc.128B
- __builtin_HEXAGON_V6_vmpauhb_acc_128B(v256, v256, 0);
- // CHECK: @llvm.hexagon.V6.vmpauhuhsat
- __builtin_HEXAGON_V6_vmpauhuhsat(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpauhuhsat.128B
- __builtin_HEXAGON_V6_vmpauhuhsat_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpsuhuhsat
- __builtin_HEXAGON_V6_vmpsuhuhsat(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpsuhuhsat.128B
- __builtin_HEXAGON_V6_vmpsuhuhsat_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpybus
- __builtin_HEXAGON_V6_vmpybus(v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpybus.128B
- __builtin_HEXAGON_V6_vmpybus_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpybus.acc
- __builtin_HEXAGON_V6_vmpybus_acc(v128, v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpybus.acc.128B
- __builtin_HEXAGON_V6_vmpybus_acc_128B(v256, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpybusv
- __builtin_HEXAGON_V6_vmpybusv(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpybusv.128B
- __builtin_HEXAGON_V6_vmpybusv_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpybusv.acc
- __builtin_HEXAGON_V6_vmpybusv_acc(v128, v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpybusv.acc.128B
- __builtin_HEXAGON_V6_vmpybusv_acc_128B(v256, v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpybv
- __builtin_HEXAGON_V6_vmpybv(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpybv.128B
- __builtin_HEXAGON_V6_vmpybv_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpybv.acc
- __builtin_HEXAGON_V6_vmpybv_acc(v128, v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpybv.acc.128B
- __builtin_HEXAGON_V6_vmpybv_acc_128B(v256, v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyewuh
- __builtin_HEXAGON_V6_vmpyewuh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyewuh.128B
- __builtin_HEXAGON_V6_vmpyewuh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyewuh.64
- __builtin_HEXAGON_V6_vmpyewuh_64(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyewuh.64.128B
- __builtin_HEXAGON_V6_vmpyewuh_64_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyh
- __builtin_HEXAGON_V6_vmpyh(v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyh.128B
- __builtin_HEXAGON_V6_vmpyh_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyh.acc
- __builtin_HEXAGON_V6_vmpyh_acc(v128, v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyh.acc.128B
- __builtin_HEXAGON_V6_vmpyh_acc_128B(v256, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyhsat.acc
- __builtin_HEXAGON_V6_vmpyhsat_acc(v128, v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyhsat.acc.128B
- __builtin_HEXAGON_V6_vmpyhsat_acc_128B(v256, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyhsrs
- __builtin_HEXAGON_V6_vmpyhsrs(v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyhsrs.128B
- __builtin_HEXAGON_V6_vmpyhsrs_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyhss
- __builtin_HEXAGON_V6_vmpyhss(v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyhss.128B
- __builtin_HEXAGON_V6_vmpyhss_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyhus
- __builtin_HEXAGON_V6_vmpyhus(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyhus.128B
- __builtin_HEXAGON_V6_vmpyhus_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyhus.acc
- __builtin_HEXAGON_V6_vmpyhus_acc(v128, v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyhus.acc.128B
- __builtin_HEXAGON_V6_vmpyhus_acc_128B(v256, v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyhv
- __builtin_HEXAGON_V6_vmpyhv(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyhv.128B
- __builtin_HEXAGON_V6_vmpyhv_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyhv.acc
- __builtin_HEXAGON_V6_vmpyhv_acc(v128, v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyhv.acc.128B
- __builtin_HEXAGON_V6_vmpyhv_acc_128B(v256, v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyhvsrs
- __builtin_HEXAGON_V6_vmpyhvsrs(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyhvsrs.128B
- __builtin_HEXAGON_V6_vmpyhvsrs_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyieoh
- __builtin_HEXAGON_V6_vmpyieoh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyieoh.128B
- __builtin_HEXAGON_V6_vmpyieoh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyiewh.acc
- __builtin_HEXAGON_V6_vmpyiewh_acc(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyiewh.acc.128B
- __builtin_HEXAGON_V6_vmpyiewh_acc_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyiewuh
- __builtin_HEXAGON_V6_vmpyiewuh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyiewuh.128B
- __builtin_HEXAGON_V6_vmpyiewuh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyiewuh.acc
- __builtin_HEXAGON_V6_vmpyiewuh_acc(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyiewuh.acc.128B
- __builtin_HEXAGON_V6_vmpyiewuh_acc_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyih
- __builtin_HEXAGON_V6_vmpyih(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyih.128B
- __builtin_HEXAGON_V6_vmpyih_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyih.acc
- __builtin_HEXAGON_V6_vmpyih_acc(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyih.acc.128B
- __builtin_HEXAGON_V6_vmpyih_acc_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyihb
- __builtin_HEXAGON_V6_vmpyihb(v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyihb.128B
- __builtin_HEXAGON_V6_vmpyihb_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyihb.acc
- __builtin_HEXAGON_V6_vmpyihb_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyihb.acc.128B
- __builtin_HEXAGON_V6_vmpyihb_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyiowh
- __builtin_HEXAGON_V6_vmpyiowh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyiowh.128B
- __builtin_HEXAGON_V6_vmpyiowh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyiwb
- __builtin_HEXAGON_V6_vmpyiwb(v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyiwb.128B
- __builtin_HEXAGON_V6_vmpyiwb_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyiwb.acc
- __builtin_HEXAGON_V6_vmpyiwb_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyiwb.acc.128B
- __builtin_HEXAGON_V6_vmpyiwb_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyiwh
- __builtin_HEXAGON_V6_vmpyiwh(v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyiwh.128B
- __builtin_HEXAGON_V6_vmpyiwh_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyiwh.acc
- __builtin_HEXAGON_V6_vmpyiwh_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyiwh.acc.128B
- __builtin_HEXAGON_V6_vmpyiwh_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyiwub
- __builtin_HEXAGON_V6_vmpyiwub(v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyiwub.128B
- __builtin_HEXAGON_V6_vmpyiwub_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyiwub.acc
- __builtin_HEXAGON_V6_vmpyiwub_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyiwub.acc.128B
- __builtin_HEXAGON_V6_vmpyiwub_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyowh
- __builtin_HEXAGON_V6_vmpyowh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyowh.128B
- __builtin_HEXAGON_V6_vmpyowh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyowh.64.acc
- __builtin_HEXAGON_V6_vmpyowh_64_acc(v128, v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyowh.64.acc.128B
- __builtin_HEXAGON_V6_vmpyowh_64_acc_128B(v256, v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyowh.rnd
- __builtin_HEXAGON_V6_vmpyowh_rnd(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyowh.rnd.128B
- __builtin_HEXAGON_V6_vmpyowh_rnd_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyowh.rnd.sacc
- __builtin_HEXAGON_V6_vmpyowh_rnd_sacc(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyowh.rnd.sacc.128B
- __builtin_HEXAGON_V6_vmpyowh_rnd_sacc_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyowh.sacc
- __builtin_HEXAGON_V6_vmpyowh_sacc(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyowh.sacc.128B
- __builtin_HEXAGON_V6_vmpyowh_sacc_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyub
- __builtin_HEXAGON_V6_vmpyub(v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyub.128B
- __builtin_HEXAGON_V6_vmpyub_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyub.acc
- __builtin_HEXAGON_V6_vmpyub_acc(v128, v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyub.acc.128B
- __builtin_HEXAGON_V6_vmpyub_acc_128B(v256, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyubv
- __builtin_HEXAGON_V6_vmpyubv(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyubv.128B
- __builtin_HEXAGON_V6_vmpyubv_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyubv.acc
- __builtin_HEXAGON_V6_vmpyubv_acc(v128, v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyubv.acc.128B
- __builtin_HEXAGON_V6_vmpyubv_acc_128B(v256, v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyuh
- __builtin_HEXAGON_V6_vmpyuh(v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyuh.128B
- __builtin_HEXAGON_V6_vmpyuh_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyuh.acc
- __builtin_HEXAGON_V6_vmpyuh_acc(v128, v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyuh.acc.128B
- __builtin_HEXAGON_V6_vmpyuh_acc_128B(v256, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyuhe
- __builtin_HEXAGON_V6_vmpyuhe(v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyuhe.128B
- __builtin_HEXAGON_V6_vmpyuhe_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyuhe.acc
- __builtin_HEXAGON_V6_vmpyuhe_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vmpyuhe.acc.128B
- __builtin_HEXAGON_V6_vmpyuhe_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vmpyuhv
- __builtin_HEXAGON_V6_vmpyuhv(v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyuhv.128B
- __builtin_HEXAGON_V6_vmpyuhv_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vmpyuhv.acc
- __builtin_HEXAGON_V6_vmpyuhv_acc(v128, v64, v64);
- // CHECK: @llvm.hexagon.V6.vmpyuhv.acc.128B
- __builtin_HEXAGON_V6_vmpyuhv_acc_128B(v256, v128, v128);
- // CHECK: @llvm.hexagon.V6.vmux
- __builtin_HEXAGON_V6_vmux(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vmux.128B
- __builtin_HEXAGON_V6_vmux_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vnavgb
- __builtin_HEXAGON_V6_vnavgb(v64, v64);
- // CHECK: @llvm.hexagon.V6.vnavgb.128B
- __builtin_HEXAGON_V6_vnavgb_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vnavgh
- __builtin_HEXAGON_V6_vnavgh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vnavgh.128B
- __builtin_HEXAGON_V6_vnavgh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vnavgub
- __builtin_HEXAGON_V6_vnavgub(v64, v64);
- // CHECK: @llvm.hexagon.V6.vnavgub.128B
- __builtin_HEXAGON_V6_vnavgub_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vnavgw
- __builtin_HEXAGON_V6_vnavgw(v64, v64);
- // CHECK: @llvm.hexagon.V6.vnavgw.128B
- __builtin_HEXAGON_V6_vnavgw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vnormamth
- __builtin_HEXAGON_V6_vnormamth(v64);
- // CHECK: @llvm.hexagon.V6.vnormamth.128B
- __builtin_HEXAGON_V6_vnormamth_128B(v128);
- // CHECK: @llvm.hexagon.V6.vnormamtw
- __builtin_HEXAGON_V6_vnormamtw(v64);
- // CHECK: @llvm.hexagon.V6.vnormamtw.128B
- __builtin_HEXAGON_V6_vnormamtw_128B(v128);
- // CHECK: @llvm.hexagon.V6.vnot
- __builtin_HEXAGON_V6_vnot(v64);
- // CHECK: @llvm.hexagon.V6.vnot.128B
- __builtin_HEXAGON_V6_vnot_128B(v128);
- // CHECK: @llvm.hexagon.V6.vor
- __builtin_HEXAGON_V6_vor(v64, v64);
- // CHECK: @llvm.hexagon.V6.vor.128B
- __builtin_HEXAGON_V6_vor_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vpackeb
- __builtin_HEXAGON_V6_vpackeb(v64, v64);
- // CHECK: @llvm.hexagon.V6.vpackeb.128B
- __builtin_HEXAGON_V6_vpackeb_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vpackeh
- __builtin_HEXAGON_V6_vpackeh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vpackeh.128B
- __builtin_HEXAGON_V6_vpackeh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vpackhb.sat
- __builtin_HEXAGON_V6_vpackhb_sat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vpackhb.sat.128B
- __builtin_HEXAGON_V6_vpackhb_sat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vpackhub.sat
- __builtin_HEXAGON_V6_vpackhub_sat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vpackhub.sat.128B
- __builtin_HEXAGON_V6_vpackhub_sat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vpackob
- __builtin_HEXAGON_V6_vpackob(v64, v64);
- // CHECK: @llvm.hexagon.V6.vpackob.128B
- __builtin_HEXAGON_V6_vpackob_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vpackoh
- __builtin_HEXAGON_V6_vpackoh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vpackoh.128B
- __builtin_HEXAGON_V6_vpackoh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vpackwh.sat
- __builtin_HEXAGON_V6_vpackwh_sat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vpackwh.sat.128B
- __builtin_HEXAGON_V6_vpackwh_sat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vpackwuh.sat
- __builtin_HEXAGON_V6_vpackwuh_sat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vpackwuh.sat.128B
- __builtin_HEXAGON_V6_vpackwuh_sat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vpopcounth
- __builtin_HEXAGON_V6_vpopcounth(v64);
- // CHECK: @llvm.hexagon.V6.vpopcounth.128B
- __builtin_HEXAGON_V6_vpopcounth_128B(v128);
- // CHECK: @llvm.hexagon.V6.vprefixqb
- __builtin_HEXAGON_V6_vprefixqb(v64);
- // CHECK: @llvm.hexagon.V6.vprefixqb.128B
- __builtin_HEXAGON_V6_vprefixqb_128B(v128);
- // CHECK: @llvm.hexagon.V6.vprefixqh
- __builtin_HEXAGON_V6_vprefixqh(v64);
- // CHECK: @llvm.hexagon.V6.vprefixqh.128B
- __builtin_HEXAGON_V6_vprefixqh_128B(v128);
- // CHECK: @llvm.hexagon.V6.vprefixqw
- __builtin_HEXAGON_V6_vprefixqw(v64);
- // CHECK: @llvm.hexagon.V6.vprefixqw.128B
- __builtin_HEXAGON_V6_vprefixqw_128B(v128);
- // CHECK: @llvm.hexagon.V6.vrdelta
- __builtin_HEXAGON_V6_vrdelta(v64, v64);
- // CHECK: @llvm.hexagon.V6.vrdelta.128B
- __builtin_HEXAGON_V6_vrdelta_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vrmpybub.rtt
- __builtin_HEXAGON_V6_vrmpybub_rtt(v64, 0);
- // CHECK: @llvm.hexagon.V6.vrmpybub.rtt.128B
- __builtin_HEXAGON_V6_vrmpybub_rtt_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vrmpybub.rtt.acc
- __builtin_HEXAGON_V6_vrmpybub_rtt_acc(v128, v64, 0);
- // CHECK: @llvm.hexagon.V6.vrmpybub.rtt.acc.128B
- __builtin_HEXAGON_V6_vrmpybub_rtt_acc_128B(v256, v128, 0);
- // CHECK: @llvm.hexagon.V6.vrmpybus
- __builtin_HEXAGON_V6_vrmpybus(v64, 0);
- // CHECK: @llvm.hexagon.V6.vrmpybus.128B
- __builtin_HEXAGON_V6_vrmpybus_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vrmpybus.acc
- __builtin_HEXAGON_V6_vrmpybus_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vrmpybus.acc.128B
- __builtin_HEXAGON_V6_vrmpybus_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vrmpybusi
- __builtin_HEXAGON_V6_vrmpybusi(v128, 0, 0);
- // CHECK: @llvm.hexagon.V6.vrmpybusi.128B
- __builtin_HEXAGON_V6_vrmpybusi_128B(v256, 0, 0);
- // CHECK: @llvm.hexagon.V6.vrmpybusi.acc
- __builtin_HEXAGON_V6_vrmpybusi_acc(v128, v128, 0, 0);
- // CHECK: @llvm.hexagon.V6.vrmpybusi.acc.128B
- __builtin_HEXAGON_V6_vrmpybusi_acc_128B(v256, v256, 0, 0);
- // CHECK: @llvm.hexagon.V6.vrmpybusv
- __builtin_HEXAGON_V6_vrmpybusv(v64, v64);
- // CHECK: @llvm.hexagon.V6.vrmpybusv.128B
- __builtin_HEXAGON_V6_vrmpybusv_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vrmpybusv.acc
- __builtin_HEXAGON_V6_vrmpybusv_acc(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vrmpybusv.acc.128B
- __builtin_HEXAGON_V6_vrmpybusv_acc_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vrmpybv
- __builtin_HEXAGON_V6_vrmpybv(v64, v64);
- // CHECK: @llvm.hexagon.V6.vrmpybv.128B
- __builtin_HEXAGON_V6_vrmpybv_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vrmpybv.acc
- __builtin_HEXAGON_V6_vrmpybv_acc(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vrmpybv.acc.128B
- __builtin_HEXAGON_V6_vrmpybv_acc_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vrmpyub
- __builtin_HEXAGON_V6_vrmpyub(v64, 0);
- // CHECK: @llvm.hexagon.V6.vrmpyub.128B
- __builtin_HEXAGON_V6_vrmpyub_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vrmpyub.acc
- __builtin_HEXAGON_V6_vrmpyub_acc(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vrmpyub.acc.128B
- __builtin_HEXAGON_V6_vrmpyub_acc_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vrmpyub.rtt
- __builtin_HEXAGON_V6_vrmpyub_rtt(v64, 0);
- // CHECK: @llvm.hexagon.V6.vrmpyub.rtt.128B
- __builtin_HEXAGON_V6_vrmpyub_rtt_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vrmpyub.rtt.acc
- __builtin_HEXAGON_V6_vrmpyub_rtt_acc(v128, v64, 0);
- // CHECK: @llvm.hexagon.V6.vrmpyub.rtt.acc.128B
- __builtin_HEXAGON_V6_vrmpyub_rtt_acc_128B(v256, v128, 0);
- // CHECK: @llvm.hexagon.V6.vrmpyubi
- __builtin_HEXAGON_V6_vrmpyubi(v128, 0, 0);
- // CHECK: @llvm.hexagon.V6.vrmpyubi.128B
- __builtin_HEXAGON_V6_vrmpyubi_128B(v256, 0, 0);
- // CHECK: @llvm.hexagon.V6.vrmpyubi.acc
- __builtin_HEXAGON_V6_vrmpyubi_acc(v128, v128, 0, 0);
- // CHECK: @llvm.hexagon.V6.vrmpyubi.acc.128B
- __builtin_HEXAGON_V6_vrmpyubi_acc_128B(v256, v256, 0, 0);
- // CHECK: @llvm.hexagon.V6.vrmpyubv
- __builtin_HEXAGON_V6_vrmpyubv(v64, v64);
- // CHECK: @llvm.hexagon.V6.vrmpyubv.128B
- __builtin_HEXAGON_V6_vrmpyubv_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vrmpyubv.acc
- __builtin_HEXAGON_V6_vrmpyubv_acc(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vrmpyubv.acc.128B
- __builtin_HEXAGON_V6_vrmpyubv_acc_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vror
- __builtin_HEXAGON_V6_vror(v64, 0);
- // CHECK: @llvm.hexagon.V6.vror.128B
- __builtin_HEXAGON_V6_vror_128B(v128, 0);
- // CHECK: @llvm.hexagon.V6.vroundhb
- __builtin_HEXAGON_V6_vroundhb(v64, v64);
- // CHECK: @llvm.hexagon.V6.vroundhb.128B
- __builtin_HEXAGON_V6_vroundhb_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vroundhub
- __builtin_HEXAGON_V6_vroundhub(v64, v64);
- // CHECK: @llvm.hexagon.V6.vroundhub.128B
- __builtin_HEXAGON_V6_vroundhub_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vrounduhub
- __builtin_HEXAGON_V6_vrounduhub(v64, v64);
- // CHECK: @llvm.hexagon.V6.vrounduhub.128B
- __builtin_HEXAGON_V6_vrounduhub_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vrounduwuh
- __builtin_HEXAGON_V6_vrounduwuh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vrounduwuh.128B
- __builtin_HEXAGON_V6_vrounduwuh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vroundwh
- __builtin_HEXAGON_V6_vroundwh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vroundwh.128B
- __builtin_HEXAGON_V6_vroundwh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vroundwuh
- __builtin_HEXAGON_V6_vroundwuh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vroundwuh.128B
- __builtin_HEXAGON_V6_vroundwuh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vrsadubi
- __builtin_HEXAGON_V6_vrsadubi(v128, 0, 0);
- // CHECK: @llvm.hexagon.V6.vrsadubi.128B
- __builtin_HEXAGON_V6_vrsadubi_128B(v256, 0, 0);
- // CHECK: @llvm.hexagon.V6.vrsadubi.acc
- __builtin_HEXAGON_V6_vrsadubi_acc(v128, v128, 0, 0);
- // CHECK: @llvm.hexagon.V6.vrsadubi.acc.128B
- __builtin_HEXAGON_V6_vrsadubi_acc_128B(v256, v256, 0, 0);
- // CHECK: @llvm.hexagon.V6.vsathub
- __builtin_HEXAGON_V6_vsathub(v64, v64);
- // CHECK: @llvm.hexagon.V6.vsathub.128B
- __builtin_HEXAGON_V6_vsathub_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsatuwuh
- __builtin_HEXAGON_V6_vsatuwuh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vsatuwuh.128B
- __builtin_HEXAGON_V6_vsatuwuh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsatwh
- __builtin_HEXAGON_V6_vsatwh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vsatwh.128B
- __builtin_HEXAGON_V6_vsatwh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsb
- __builtin_HEXAGON_V6_vsb(v64);
- // CHECK: @llvm.hexagon.V6.vsb.128B
- __builtin_HEXAGON_V6_vsb_128B(v128);
- // CHECK: @llvm.hexagon.V6.vscattermh
- __builtin_HEXAGON_V6_vscattermh(0, 0, v64, v64);
- // CHECK: @llvm.hexagon.V6.vscattermh.128B
- __builtin_HEXAGON_V6_vscattermh_128B(0, 0, v128, v128);
- // CHECK: @llvm.hexagon.V6.vscattermh.add
- __builtin_HEXAGON_V6_vscattermh_add(0, 0, v64, v64);
- // CHECK: @llvm.hexagon.V6.vscattermh.add.128B
- __builtin_HEXAGON_V6_vscattermh_add_128B(0, 0, v128, v128);
- // CHECK: @llvm.hexagon.V6.vscattermhq
- __builtin_HEXAGON_V6_vscattermhq(v64, 0, 0, v64, v64);
- // CHECK: @llvm.hexagon.V6.vscattermhq.128B
- __builtin_HEXAGON_V6_vscattermhq_128B(v128, 0, 0, v128, v128);
- // CHECK: @llvm.hexagon.V6.vscattermhw
- __builtin_HEXAGON_V6_vscattermhw(0, 0, v128, v64);
- // CHECK: @llvm.hexagon.V6.vscattermhw.128B
- __builtin_HEXAGON_V6_vscattermhw_128B(0, 0, v256, v128);
- // CHECK: @llvm.hexagon.V6.vscattermhw.add
- __builtin_HEXAGON_V6_vscattermhw_add(0, 0, v128, v64);
- // CHECK: @llvm.hexagon.V6.vscattermhw.add.128B
- __builtin_HEXAGON_V6_vscattermhw_add_128B(0, 0, v256, v128);
- // CHECK: @llvm.hexagon.V6.vscattermhwq
- __builtin_HEXAGON_V6_vscattermhwq(v64, 0, 0, v128, v64);
- // CHECK: @llvm.hexagon.V6.vscattermhwq.128B
- __builtin_HEXAGON_V6_vscattermhwq_128B(v128, 0, 0, v256, v128);
- // CHECK: @llvm.hexagon.V6.vscattermw
- __builtin_HEXAGON_V6_vscattermw(0, 0, v64, v64);
- // CHECK: @llvm.hexagon.V6.vscattermw.128B
- __builtin_HEXAGON_V6_vscattermw_128B(0, 0, v128, v128);
- // CHECK: @llvm.hexagon.V6.vscattermw.add
- __builtin_HEXAGON_V6_vscattermw_add(0, 0, v64, v64);
- // CHECK: @llvm.hexagon.V6.vscattermw.add.128B
- __builtin_HEXAGON_V6_vscattermw_add_128B(0, 0, v128, v128);
- // CHECK: @llvm.hexagon.V6.vscattermwq
- __builtin_HEXAGON_V6_vscattermwq(v64, 0, 0, v64, v64);
- // CHECK: @llvm.hexagon.V6.vscattermwq.128B
- __builtin_HEXAGON_V6_vscattermwq_128B(v128, 0, 0, v128, v128);
- // CHECK: @llvm.hexagon.V6.vsh
- __builtin_HEXAGON_V6_vsh(v64);
- // CHECK: @llvm.hexagon.V6.vsh.128B
- __builtin_HEXAGON_V6_vsh_128B(v128);
- // CHECK: @llvm.hexagon.V6.vshufeh
- __builtin_HEXAGON_V6_vshufeh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vshufeh.128B
- __builtin_HEXAGON_V6_vshufeh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vshuffb
- __builtin_HEXAGON_V6_vshuffb(v64);
- // CHECK: @llvm.hexagon.V6.vshuffb.128B
- __builtin_HEXAGON_V6_vshuffb_128B(v128);
- // CHECK: @llvm.hexagon.V6.vshuffeb
- __builtin_HEXAGON_V6_vshuffeb(v64, v64);
- // CHECK: @llvm.hexagon.V6.vshuffeb.128B
- __builtin_HEXAGON_V6_vshuffeb_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vshuffh
- __builtin_HEXAGON_V6_vshuffh(v64);
- // CHECK: @llvm.hexagon.V6.vshuffh.128B
- __builtin_HEXAGON_V6_vshuffh_128B(v128);
- // CHECK: @llvm.hexagon.V6.vshuffob
- __builtin_HEXAGON_V6_vshuffob(v64, v64);
- // CHECK: @llvm.hexagon.V6.vshuffob.128B
- __builtin_HEXAGON_V6_vshuffob_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vshuffvdd
- __builtin_HEXAGON_V6_vshuffvdd(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vshuffvdd.128B
- __builtin_HEXAGON_V6_vshuffvdd_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vshufoeb
- __builtin_HEXAGON_V6_vshufoeb(v64, v64);
- // CHECK: @llvm.hexagon.V6.vshufoeb.128B
- __builtin_HEXAGON_V6_vshufoeb_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vshufoeh
- __builtin_HEXAGON_V6_vshufoeh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vshufoeh.128B
- __builtin_HEXAGON_V6_vshufoeh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vshufoh
- __builtin_HEXAGON_V6_vshufoh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vshufoh.128B
- __builtin_HEXAGON_V6_vshufoh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubb
- __builtin_HEXAGON_V6_vsubb(v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubb.128B
- __builtin_HEXAGON_V6_vsubb_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubb.dv
- __builtin_HEXAGON_V6_vsubb_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubb.dv.128B
- __builtin_HEXAGON_V6_vsubb_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vsubbnq
- __builtin_HEXAGON_V6_vsubbnq(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubbnq.128B
- __builtin_HEXAGON_V6_vsubbnq_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubbq
- __builtin_HEXAGON_V6_vsubbq(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubbq.128B
- __builtin_HEXAGON_V6_vsubbq_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubbsat
- __builtin_HEXAGON_V6_vsubbsat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubbsat.128B
- __builtin_HEXAGON_V6_vsubbsat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubbsat.dv
- __builtin_HEXAGON_V6_vsubbsat_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubbsat.dv.128B
- __builtin_HEXAGON_V6_vsubbsat_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vsubcarry
- __builtin_HEXAGON_V6_vsubcarry(v64, v64, 0);
- // CHECK: @llvm.hexagon.V6.vsubcarry.128B
- __builtin_HEXAGON_V6_vsubcarry_128B(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vsubh
- __builtin_HEXAGON_V6_vsubh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubh.128B
- __builtin_HEXAGON_V6_vsubh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubh.dv
- __builtin_HEXAGON_V6_vsubh_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubh.dv.128B
- __builtin_HEXAGON_V6_vsubh_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vsubhnq
- __builtin_HEXAGON_V6_vsubhnq(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubhnq.128B
- __builtin_HEXAGON_V6_vsubhnq_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubhq
- __builtin_HEXAGON_V6_vsubhq(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubhq.128B
- __builtin_HEXAGON_V6_vsubhq_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubhsat
- __builtin_HEXAGON_V6_vsubhsat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubhsat.128B
- __builtin_HEXAGON_V6_vsubhsat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubhsat.dv
- __builtin_HEXAGON_V6_vsubhsat_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubhsat.dv.128B
- __builtin_HEXAGON_V6_vsubhsat_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vsubhw
- __builtin_HEXAGON_V6_vsubhw(v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubhw.128B
- __builtin_HEXAGON_V6_vsubhw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsububh
- __builtin_HEXAGON_V6_vsububh(v64, v64);
- // CHECK: @llvm.hexagon.V6.vsububh.128B
- __builtin_HEXAGON_V6_vsububh_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsububsat
- __builtin_HEXAGON_V6_vsububsat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vsububsat.128B
- __builtin_HEXAGON_V6_vsububsat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsububsat.dv
- __builtin_HEXAGON_V6_vsububsat_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsububsat.dv.128B
- __builtin_HEXAGON_V6_vsububsat_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vsubububb.sat
- __builtin_HEXAGON_V6_vsubububb_sat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubububb.sat.128B
- __builtin_HEXAGON_V6_vsubububb_sat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubuhsat
- __builtin_HEXAGON_V6_vsubuhsat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubuhsat.128B
- __builtin_HEXAGON_V6_vsubuhsat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubuhsat.dv
- __builtin_HEXAGON_V6_vsubuhsat_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubuhsat.dv.128B
- __builtin_HEXAGON_V6_vsubuhsat_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vsubuhw
- __builtin_HEXAGON_V6_vsubuhw(v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubuhw.128B
- __builtin_HEXAGON_V6_vsubuhw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubuwsat
- __builtin_HEXAGON_V6_vsubuwsat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubuwsat.128B
- __builtin_HEXAGON_V6_vsubuwsat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubuwsat.dv
- __builtin_HEXAGON_V6_vsubuwsat_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubuwsat.dv.128B
- __builtin_HEXAGON_V6_vsubuwsat_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vsubw
- __builtin_HEXAGON_V6_vsubw(v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubw.128B
- __builtin_HEXAGON_V6_vsubw_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubw.dv
- __builtin_HEXAGON_V6_vsubw_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubw.dv.128B
- __builtin_HEXAGON_V6_vsubw_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vsubwnq
- __builtin_HEXAGON_V6_vsubwnq(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubwnq.128B
- __builtin_HEXAGON_V6_vsubwnq_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubwq
- __builtin_HEXAGON_V6_vsubwq(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubwq.128B
- __builtin_HEXAGON_V6_vsubwq_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubwsat
- __builtin_HEXAGON_V6_vsubwsat(v64, v64);
- // CHECK: @llvm.hexagon.V6.vsubwsat.128B
- __builtin_HEXAGON_V6_vsubwsat_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubwsat.dv
- __builtin_HEXAGON_V6_vsubwsat_dv(v128, v128);
- // CHECK: @llvm.hexagon.V6.vsubwsat.dv.128B
- __builtin_HEXAGON_V6_vsubwsat_dv_128B(v256, v256);
- // CHECK: @llvm.hexagon.V6.vswap
- __builtin_HEXAGON_V6_vswap(v64, v64, v64);
- // CHECK: @llvm.hexagon.V6.vswap.128B
- __builtin_HEXAGON_V6_vswap_128B(v128, v128, v128);
- // CHECK: @llvm.hexagon.V6.vtmpyb
- __builtin_HEXAGON_V6_vtmpyb(v128, 0);
- // CHECK: @llvm.hexagon.V6.vtmpyb.128B
- __builtin_HEXAGON_V6_vtmpyb_128B(v256, 0);
- // CHECK: @llvm.hexagon.V6.vtmpyb.acc
- __builtin_HEXAGON_V6_vtmpyb_acc(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vtmpyb.acc.128B
- __builtin_HEXAGON_V6_vtmpyb_acc_128B(v256, v256, 0);
- // CHECK: @llvm.hexagon.V6.vtmpybus
- __builtin_HEXAGON_V6_vtmpybus(v128, 0);
- // CHECK: @llvm.hexagon.V6.vtmpybus.128B
- __builtin_HEXAGON_V6_vtmpybus_128B(v256, 0);
- // CHECK: @llvm.hexagon.V6.vtmpybus.acc
- __builtin_HEXAGON_V6_vtmpybus_acc(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vtmpybus.acc.128B
- __builtin_HEXAGON_V6_vtmpybus_acc_128B(v256, v256, 0);
- // CHECK: @llvm.hexagon.V6.vtmpyhb
- __builtin_HEXAGON_V6_vtmpyhb(v128, 0);
- // CHECK: @llvm.hexagon.V6.vtmpyhb.128B
- __builtin_HEXAGON_V6_vtmpyhb_128B(v256, 0);
- // CHECK: @llvm.hexagon.V6.vtmpyhb.acc
- __builtin_HEXAGON_V6_vtmpyhb_acc(v128, v128, 0);
- // CHECK: @llvm.hexagon.V6.vtmpyhb.acc.128B
- __builtin_HEXAGON_V6_vtmpyhb_acc_128B(v256, v256, 0);
- // CHECK: @llvm.hexagon.V6.vunpackb
- __builtin_HEXAGON_V6_vunpackb(v64);
- // CHECK: @llvm.hexagon.V6.vunpackb.128B
- __builtin_HEXAGON_V6_vunpackb_128B(v128);
- // CHECK: @llvm.hexagon.V6.vunpackh
- __builtin_HEXAGON_V6_vunpackh(v64);
- // CHECK: @llvm.hexagon.V6.vunpackh.128B
- __builtin_HEXAGON_V6_vunpackh_128B(v128);
- // CHECK: @llvm.hexagon.V6.vunpackob
- __builtin_HEXAGON_V6_vunpackob(v128, v64);
- // CHECK: @llvm.hexagon.V6.vunpackob.128B
- __builtin_HEXAGON_V6_vunpackob_128B(v256, v128);
- // CHECK: @llvm.hexagon.V6.vunpackoh
- __builtin_HEXAGON_V6_vunpackoh(v128, v64);
- // CHECK: @llvm.hexagon.V6.vunpackoh.128B
- __builtin_HEXAGON_V6_vunpackoh_128B(v256, v128);
- // CHECK: @llvm.hexagon.V6.vunpackub
- __builtin_HEXAGON_V6_vunpackub(v64);
- // CHECK: @llvm.hexagon.V6.vunpackub.128B
- __builtin_HEXAGON_V6_vunpackub_128B(v128);
- // CHECK: @llvm.hexagon.V6.vunpackuh
- __builtin_HEXAGON_V6_vunpackuh(v64);
- // CHECK: @llvm.hexagon.V6.vunpackuh.128B
- __builtin_HEXAGON_V6_vunpackuh_128B(v128);
- // CHECK: @llvm.hexagon.V6.vxor
- __builtin_HEXAGON_V6_vxor(v64, v64);
- // CHECK: @llvm.hexagon.V6.vxor.128B
- __builtin_HEXAGON_V6_vxor_128B(v128, v128);
- // CHECK: @llvm.hexagon.V6.vzb
- __builtin_HEXAGON_V6_vzb(v64);
- // CHECK: @llvm.hexagon.V6.vzb.128B
- __builtin_HEXAGON_V6_vzb_128B(v128);
- // CHECK: @llvm.hexagon.V6.vzh
- __builtin_HEXAGON_V6_vzh(v64);
- // CHECK: @llvm.hexagon.V6.vzh.128B
- __builtin_HEXAGON_V6_vzh_128B(v128);
// CHECK: @llvm.hexagon.Y2.dccleana
__builtin_HEXAGON_Y2_dccleana(0);
// CHECK: @llvm.hexagon.Y2.dccleaninva
@@ -3322,28 +1734,30 @@ void test() {
__builtin_HEXAGON_Y4_l2fetch(0, 0);
// CHECK: @llvm.hexagon.Y5.l2fetch
__builtin_HEXAGON_Y5_l2fetch(0, 0);
- // CHECK: @llvm.hexagon.brev.ldb
+
+ // CHECK: @llvm.hexagon.L2.loadrb.pbr
__builtin_brev_ldb(0, 0, 0);
- // CHECK: @llvm.hexagon.brev.ldd
+ // CHECK: @llvm.hexagon.L2.loadrd.pbr
__builtin_brev_ldd(0, 0, 0);
- // CHECK: @llvm.hexagon.brev.ldh
+ // CHECK: @llvm.hexagon.L2.loadrh.pbr
__builtin_brev_ldh(0, 0, 0);
- // CHECK: @llvm.hexagon.brev.ldub
+ // CHECK: @llvm.hexagon.L2.loadrub.pbr
__builtin_brev_ldub(0, 0, 0);
- // CHECK: @llvm.hexagon.brev.lduh
+ // CHECK: @llvm.hexagon.L2.loadruh.pbr
__builtin_brev_lduh(0, 0, 0);
- // CHECK: @llvm.hexagon.brev.ldw
+ // CHECK: @llvm.hexagon.L2.loadri.pbr
__builtin_brev_ldw(0, 0, 0);
- // CHECK: @llvm.hexagon.brev.stb
+ // CHECK: @llvm.hexagon.S2.storerb.pbr
__builtin_brev_stb(0, 0, 0);
- // CHECK: @llvm.hexagon.brev.std
- __builtin_brev_std(0, 0, 0);
- // CHECK: @llvm.hexagon.brev.sth
+ // CHECK: @llvm.hexagon.S2.storerd.pbr
+ __builtin_brev_std(0, 0LL, 0);
+ // CHECK: @llvm.hexagon.S2.storerh.pbr
__builtin_brev_sth(0, 0, 0);
- // CHECK: @llvm.hexagon.brev.sthhi
+ // CHECK: @llvm.hexagon.S2.storerf.pbr
__builtin_brev_sthhi(0, 0, 0);
- // CHECK: @llvm.hexagon.brev.stw
+ // CHECK: @llvm.hexagon.S2.storeri.pbr
__builtin_brev_stw(0, 0, 0);
+
// CHECK: @llvm.hexagon.circ.ldb
__builtin_circ_ldb(0, 0, 0, 0);
// CHECK: @llvm.hexagon.circ.ldd
diff --git a/test/CodeGen/builtins-hvx128.c b/test/CodeGen/builtins-hvx128.c
new file mode 100644
index 000000000000..07d0e050ddc0
--- /dev/null
+++ b/test/CodeGen/builtins-hvx128.c
@@ -0,0 +1,802 @@
+// REQUIRES: hexagon-registered-target
+// RUN: %clang_cc1 -triple hexagon-unknown-elf -target-cpu hexagonv65 -target-feature +hvxv65 -target-feature +hvx-length128b -emit-llvm %s -o - | FileCheck %s
+
+void test() {
+ int v128 __attribute__((__vector_size__(128)));
+ int v256 __attribute__((__vector_size__(256)));
+
+ // CHECK: @llvm.hexagon.V6.extractw.128B
+ __builtin_HEXAGON_V6_extractw_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.hi.128B
+ __builtin_HEXAGON_V6_hi_128B(v256);
+ // CHECK: @llvm.hexagon.V6.lo.128B
+ __builtin_HEXAGON_V6_lo_128B(v256);
+ // CHECK: @llvm.hexagon.V6.lvsplatb.128B
+ __builtin_HEXAGON_V6_lvsplatb_128B(0);
+ // CHECK: @llvm.hexagon.V6.lvsplath.128B
+ __builtin_HEXAGON_V6_lvsplath_128B(0);
+ // CHECK: @llvm.hexagon.V6.lvsplatw.128B
+ __builtin_HEXAGON_V6_lvsplatw_128B(0);
+ // CHECK: @llvm.hexagon.V6.pred.and.128B
+ __builtin_HEXAGON_V6_pred_and_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.pred.and.n.128B
+ __builtin_HEXAGON_V6_pred_and_n_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.pred.not.128B
+ __builtin_HEXAGON_V6_pred_not_128B(v128);
+ // CHECK: @llvm.hexagon.V6.pred.or.128B
+ __builtin_HEXAGON_V6_pred_or_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.pred.or.n.128B
+ __builtin_HEXAGON_V6_pred_or_n_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.pred.scalar2.128B
+ __builtin_HEXAGON_V6_pred_scalar2_128B(0);
+ // CHECK: @llvm.hexagon.V6.pred.scalar2v2.128B
+ __builtin_HEXAGON_V6_pred_scalar2v2_128B(0);
+ // CHECK: @llvm.hexagon.V6.pred.xor.128B
+ __builtin_HEXAGON_V6_pred_xor_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.shuffeqh.128B
+ __builtin_HEXAGON_V6_shuffeqh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.shuffeqw.128B
+ __builtin_HEXAGON_V6_shuffeqw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vS32b.nqpred.ai.128B
+ __builtin_HEXAGON_V6_vS32b_nqpred_ai_128B(v128, 0, v128);
+ // CHECK: @llvm.hexagon.V6.vS32b.nt.nqpred.ai.128B
+ __builtin_HEXAGON_V6_vS32b_nt_nqpred_ai_128B(v128, 0, v128);
+ // CHECK: @llvm.hexagon.V6.vS32b.nt.qpred.ai.128B
+ __builtin_HEXAGON_V6_vS32b_nt_qpred_ai_128B(v128, 0, v128);
+ // CHECK: @llvm.hexagon.V6.vS32b.qpred.ai.128B
+ __builtin_HEXAGON_V6_vS32b_qpred_ai_128B(v128, 0, v128);
+ // CHECK: @llvm.hexagon.V6.vabsb.128B
+ __builtin_HEXAGON_V6_vabsb_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vabsb.sat.128B
+ __builtin_HEXAGON_V6_vabsb_sat_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vabsdiffh.128B
+ __builtin_HEXAGON_V6_vabsdiffh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vabsdiffub.128B
+ __builtin_HEXAGON_V6_vabsdiffub_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vabsdiffuh.128B
+ __builtin_HEXAGON_V6_vabsdiffuh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vabsdiffw.128B
+ __builtin_HEXAGON_V6_vabsdiffw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vabsh.128B
+ __builtin_HEXAGON_V6_vabsh_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vabsh.sat.128B
+ __builtin_HEXAGON_V6_vabsh_sat_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vabsw.128B
+ __builtin_HEXAGON_V6_vabsw_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vabsw.sat.128B
+ __builtin_HEXAGON_V6_vabsw_sat_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vaddb.128B
+ __builtin_HEXAGON_V6_vaddb_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddb.dv.128B
+ __builtin_HEXAGON_V6_vaddb_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vaddbnq.128B
+ __builtin_HEXAGON_V6_vaddbnq_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddbq.128B
+ __builtin_HEXAGON_V6_vaddbq_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddbsat.128B
+ __builtin_HEXAGON_V6_vaddbsat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddbsat.dv.128B
+ __builtin_HEXAGON_V6_vaddbsat_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vaddcarry.128B
+ __builtin_HEXAGON_V6_vaddcarry_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vaddclbh.128B
+ __builtin_HEXAGON_V6_vaddclbh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddclbw.128B
+ __builtin_HEXAGON_V6_vaddclbw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddh.128B
+ __builtin_HEXAGON_V6_vaddh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddh.dv.128B
+ __builtin_HEXAGON_V6_vaddh_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vaddhnq.128B
+ __builtin_HEXAGON_V6_vaddhnq_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddhq.128B
+ __builtin_HEXAGON_V6_vaddhq_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddhsat.128B
+ __builtin_HEXAGON_V6_vaddhsat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddhsat.dv.128B
+ __builtin_HEXAGON_V6_vaddhsat_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vaddhw.128B
+ __builtin_HEXAGON_V6_vaddhw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddhw.acc.128B
+ __builtin_HEXAGON_V6_vaddhw_acc_128B(v256, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddubh.128B
+ __builtin_HEXAGON_V6_vaddubh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddubh.acc.128B
+ __builtin_HEXAGON_V6_vaddubh_acc_128B(v256, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddubsat.128B
+ __builtin_HEXAGON_V6_vaddubsat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddubsat.dv.128B
+ __builtin_HEXAGON_V6_vaddubsat_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vaddububb.sat.128B
+ __builtin_HEXAGON_V6_vaddububb_sat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vadduhsat.128B
+ __builtin_HEXAGON_V6_vadduhsat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vadduhsat.dv.128B
+ __builtin_HEXAGON_V6_vadduhsat_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vadduhw.128B
+ __builtin_HEXAGON_V6_vadduhw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vadduhw.acc.128B
+ __builtin_HEXAGON_V6_vadduhw_acc_128B(v256, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vadduwsat.128B
+ __builtin_HEXAGON_V6_vadduwsat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vadduwsat.dv.128B
+ __builtin_HEXAGON_V6_vadduwsat_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vaddw.128B
+ __builtin_HEXAGON_V6_vaddw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddw.dv.128B
+ __builtin_HEXAGON_V6_vaddw_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vaddwnq.128B
+ __builtin_HEXAGON_V6_vaddwnq_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddwq.128B
+ __builtin_HEXAGON_V6_vaddwq_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddwsat.128B
+ __builtin_HEXAGON_V6_vaddwsat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddwsat.dv.128B
+ __builtin_HEXAGON_V6_vaddwsat_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.valignb.128B
+ __builtin_HEXAGON_V6_valignb_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.valignbi.128B
+ __builtin_HEXAGON_V6_valignbi_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vand.128B
+ __builtin_HEXAGON_V6_vand_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vandnqrt.128B
+ __builtin_HEXAGON_V6_vandnqrt_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vandnqrt.acc.128B
+ __builtin_HEXAGON_V6_vandnqrt_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vandqrt.128B
+ __builtin_HEXAGON_V6_vandqrt_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vandqrt.acc.128B
+ __builtin_HEXAGON_V6_vandqrt_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vandvnqv.128B
+ __builtin_HEXAGON_V6_vandvnqv_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vandvqv.128B
+ __builtin_HEXAGON_V6_vandvqv_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vandvrt.128B
+ __builtin_HEXAGON_V6_vandvrt_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vandvrt.acc.128B
+ __builtin_HEXAGON_V6_vandvrt_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vaslh.128B
+ __builtin_HEXAGON_V6_vaslh_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vaslh.acc.128B
+ __builtin_HEXAGON_V6_vaslh_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vaslhv.128B
+ __builtin_HEXAGON_V6_vaslhv_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaslw.128B
+ __builtin_HEXAGON_V6_vaslw_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vaslw.acc.128B
+ __builtin_HEXAGON_V6_vaslw_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vaslwv.128B
+ __builtin_HEXAGON_V6_vaslwv_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vasrh.128B
+ __builtin_HEXAGON_V6_vasrh_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasrh.acc.128B
+ __builtin_HEXAGON_V6_vasrh_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasrhbrndsat.128B
+ __builtin_HEXAGON_V6_vasrhbrndsat_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasrhbsat.128B
+ __builtin_HEXAGON_V6_vasrhbsat_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasrhubrndsat.128B
+ __builtin_HEXAGON_V6_vasrhubrndsat_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasrhubsat.128B
+ __builtin_HEXAGON_V6_vasrhubsat_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasrhv.128B
+ __builtin_HEXAGON_V6_vasrhv_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vasruhubrndsat.128B
+ __builtin_HEXAGON_V6_vasruhubrndsat_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasruhubsat.128B
+ __builtin_HEXAGON_V6_vasruhubsat_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasruwuhrndsat.128B
+ __builtin_HEXAGON_V6_vasruwuhrndsat_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasruwuhsat.128B
+ __builtin_HEXAGON_V6_vasruwuhsat_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasrw.128B
+ __builtin_HEXAGON_V6_vasrw_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasrw.acc.128B
+ __builtin_HEXAGON_V6_vasrw_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasrwh.128B
+ __builtin_HEXAGON_V6_vasrwh_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasrwhrndsat.128B
+ __builtin_HEXAGON_V6_vasrwhrndsat_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasrwhsat.128B
+ __builtin_HEXAGON_V6_vasrwhsat_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasrwuhrndsat.128B
+ __builtin_HEXAGON_V6_vasrwuhrndsat_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasrwuhsat.128B
+ __builtin_HEXAGON_V6_vasrwuhsat_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vasrwv.128B
+ __builtin_HEXAGON_V6_vasrwv_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vassign.128B
+ __builtin_HEXAGON_V6_vassign_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vassignp.128B
+ __builtin_HEXAGON_V6_vassignp_128B(v256);
+ // CHECK: @llvm.hexagon.V6.vavgb.128B
+ __builtin_HEXAGON_V6_vavgb_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vavgbrnd.128B
+ __builtin_HEXAGON_V6_vavgbrnd_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vavgh.128B
+ __builtin_HEXAGON_V6_vavgh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vavghrnd.128B
+ __builtin_HEXAGON_V6_vavghrnd_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vavgub.128B
+ __builtin_HEXAGON_V6_vavgub_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vavgubrnd.128B
+ __builtin_HEXAGON_V6_vavgubrnd_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vavguh.128B
+ __builtin_HEXAGON_V6_vavguh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vavguhrnd.128B
+ __builtin_HEXAGON_V6_vavguhrnd_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vavguw.128B
+ __builtin_HEXAGON_V6_vavguw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vavguwrnd.128B
+ __builtin_HEXAGON_V6_vavguwrnd_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vavgw.128B
+ __builtin_HEXAGON_V6_vavgw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vavgwrnd.128B
+ __builtin_HEXAGON_V6_vavgwrnd_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vcl0h.128B
+ __builtin_HEXAGON_V6_vcl0h_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vcl0w.128B
+ __builtin_HEXAGON_V6_vcl0w_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vcombine.128B
+ __builtin_HEXAGON_V6_vcombine_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vd0.128B
+ __builtin_HEXAGON_V6_vd0_128B();
+ // CHECK: @llvm.hexagon.V6.vdd0.128B
+ __builtin_HEXAGON_V6_vdd0_128B();
+ // CHECK: @llvm.hexagon.V6.vdealb.128B
+ __builtin_HEXAGON_V6_vdealb_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vdealb4w.128B
+ __builtin_HEXAGON_V6_vdealb4w_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vdealh.128B
+ __builtin_HEXAGON_V6_vdealh_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vdealvdd.128B
+ __builtin_HEXAGON_V6_vdealvdd_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdelta.128B
+ __builtin_HEXAGON_V6_vdelta_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vdmpybus.128B
+ __builtin_HEXAGON_V6_vdmpybus_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpybus.acc.128B
+ __builtin_HEXAGON_V6_vdmpybus_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpybus.dv.128B
+ __builtin_HEXAGON_V6_vdmpybus_dv_128B(v256, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpybus.dv.acc.128B
+ __builtin_HEXAGON_V6_vdmpybus_dv_acc_128B(v256, v256, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhb.128B
+ __builtin_HEXAGON_V6_vdmpyhb_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhb.acc.128B
+ __builtin_HEXAGON_V6_vdmpyhb_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhb.dv.128B
+ __builtin_HEXAGON_V6_vdmpyhb_dv_128B(v256, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhb.dv.acc.128B
+ __builtin_HEXAGON_V6_vdmpyhb_dv_acc_128B(v256, v256, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhisat.128B
+ __builtin_HEXAGON_V6_vdmpyhisat_128B(v256, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhisat.acc.128B
+ __builtin_HEXAGON_V6_vdmpyhisat_acc_128B(v128, v256, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhsat.128B
+ __builtin_HEXAGON_V6_vdmpyhsat_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhsat.acc.128B
+ __builtin_HEXAGON_V6_vdmpyhsat_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhsuisat.128B
+ __builtin_HEXAGON_V6_vdmpyhsuisat_128B(v256, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhsuisat.acc.128B
+ __builtin_HEXAGON_V6_vdmpyhsuisat_acc_128B(v128, v256, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhsusat.128B
+ __builtin_HEXAGON_V6_vdmpyhsusat_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhsusat.acc.128B
+ __builtin_HEXAGON_V6_vdmpyhsusat_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhvsat.128B
+ __builtin_HEXAGON_V6_vdmpyhvsat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vdmpyhvsat.acc.128B
+ __builtin_HEXAGON_V6_vdmpyhvsat_acc_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vdsaduh.128B
+ __builtin_HEXAGON_V6_vdsaduh_128B(v256, 0);
+ // CHECK: @llvm.hexagon.V6.vdsaduh.acc.128B
+ __builtin_HEXAGON_V6_vdsaduh_acc_128B(v256, v256, 0);
+ // CHECK: @llvm.hexagon.V6.veqb.128B
+ __builtin_HEXAGON_V6_veqb_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.veqb.and.128B
+ __builtin_HEXAGON_V6_veqb_and_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.veqb.or.128B
+ __builtin_HEXAGON_V6_veqb_or_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.veqb.xor.128B
+ __builtin_HEXAGON_V6_veqb_xor_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.veqh.128B
+ __builtin_HEXAGON_V6_veqh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.veqh.and.128B
+ __builtin_HEXAGON_V6_veqh_and_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.veqh.or.128B
+ __builtin_HEXAGON_V6_veqh_or_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.veqh.xor.128B
+ __builtin_HEXAGON_V6_veqh_xor_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.veqw.128B
+ __builtin_HEXAGON_V6_veqw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.veqw.and.128B
+ __builtin_HEXAGON_V6_veqw_and_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.veqw.or.128B
+ __builtin_HEXAGON_V6_veqw_or_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.veqw.xor.128B
+ __builtin_HEXAGON_V6_veqw_xor_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgathermh.128B
+ __builtin_HEXAGON_V6_vgathermh_128B(0, 0, 0, v128);
+ // CHECK: @llvm.hexagon.V6.vgathermhq.128B
+ __builtin_HEXAGON_V6_vgathermhq_128B(0, v128, 0, 0, v128);
+ // CHECK: @llvm.hexagon.V6.vgathermhw.128B
+ __builtin_HEXAGON_V6_vgathermhw_128B(0, 0, 0, v256);
+ // CHECK: @llvm.hexagon.V6.vgathermhwq.128B
+ __builtin_HEXAGON_V6_vgathermhwq_128B(0, v128, 0, 0, v256);
+ // CHECK: @llvm.hexagon.V6.vgathermw.128B
+ __builtin_HEXAGON_V6_vgathermw_128B(0, 0, 0, v128);
+ // CHECK: @llvm.hexagon.V6.vgathermwq.128B
+ __builtin_HEXAGON_V6_vgathermwq_128B(0, v128, 0, 0, v128);
+ // CHECK: @llvm.hexagon.V6.vgtb.128B
+ __builtin_HEXAGON_V6_vgtb_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtb.and.128B
+ __builtin_HEXAGON_V6_vgtb_and_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtb.or.128B
+ __builtin_HEXAGON_V6_vgtb_or_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtb.xor.128B
+ __builtin_HEXAGON_V6_vgtb_xor_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgth.128B
+ __builtin_HEXAGON_V6_vgth_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgth.and.128B
+ __builtin_HEXAGON_V6_vgth_and_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgth.or.128B
+ __builtin_HEXAGON_V6_vgth_or_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgth.xor.128B
+ __builtin_HEXAGON_V6_vgth_xor_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtub.128B
+ __builtin_HEXAGON_V6_vgtub_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtub.and.128B
+ __builtin_HEXAGON_V6_vgtub_and_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtub.or.128B
+ __builtin_HEXAGON_V6_vgtub_or_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtub.xor.128B
+ __builtin_HEXAGON_V6_vgtub_xor_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtuh.128B
+ __builtin_HEXAGON_V6_vgtuh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtuh.and.128B
+ __builtin_HEXAGON_V6_vgtuh_and_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtuh.or.128B
+ __builtin_HEXAGON_V6_vgtuh_or_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtuh.xor.128B
+ __builtin_HEXAGON_V6_vgtuh_xor_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtuw.128B
+ __builtin_HEXAGON_V6_vgtuw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtuw.and.128B
+ __builtin_HEXAGON_V6_vgtuw_and_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtuw.or.128B
+ __builtin_HEXAGON_V6_vgtuw_or_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtuw.xor.128B
+ __builtin_HEXAGON_V6_vgtuw_xor_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtw.128B
+ __builtin_HEXAGON_V6_vgtw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtw.and.128B
+ __builtin_HEXAGON_V6_vgtw_and_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtw.or.128B
+ __builtin_HEXAGON_V6_vgtw_or_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vgtw.xor.128B
+ __builtin_HEXAGON_V6_vgtw_xor_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vinsertwr.128B
+ __builtin_HEXAGON_V6_vinsertwr_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vlalignb.128B
+ __builtin_HEXAGON_V6_vlalignb_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vlalignbi.128B
+ __builtin_HEXAGON_V6_vlalignbi_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vlsrb.128B
+ __builtin_HEXAGON_V6_vlsrb_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vlsrh.128B
+ __builtin_HEXAGON_V6_vlsrh_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vlsrhv.128B
+ __builtin_HEXAGON_V6_vlsrhv_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vlsrw.128B
+ __builtin_HEXAGON_V6_vlsrw_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vlsrwv.128B
+ __builtin_HEXAGON_V6_vlsrwv_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vlut4.128B
+ __builtin_HEXAGON_V6_vlut4_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvvb.128B
+ __builtin_HEXAGON_V6_vlutvvb_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvvb.nm.128B
+ __builtin_HEXAGON_V6_vlutvvb_nm_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvvb.oracc.128B
+ __builtin_HEXAGON_V6_vlutvvb_oracc_128B(v128, v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvvb.oracci.128B
+ __builtin_HEXAGON_V6_vlutvvb_oracci_128B(v128, v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvvbi.128B
+ __builtin_HEXAGON_V6_vlutvvbi_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvwh.128B
+ __builtin_HEXAGON_V6_vlutvwh_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvwh.nm.128B
+ __builtin_HEXAGON_V6_vlutvwh_nm_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvwh.oracc.128B
+ __builtin_HEXAGON_V6_vlutvwh_oracc_128B(v256, v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvwh.oracci.128B
+ __builtin_HEXAGON_V6_vlutvwh_oracci_128B(v256, v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvwhi.128B
+ __builtin_HEXAGON_V6_vlutvwhi_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmaskedstorenq.128B
+ __builtin_HEXAGON_V6_vmaskedstorenq_128B(v128, 0, v128);
+ // CHECK: @llvm.hexagon.V6.vmaskedstorentnq.128B
+ __builtin_HEXAGON_V6_vmaskedstorentnq_128B(v128, 0, v128);
+ // CHECK: @llvm.hexagon.V6.vmaskedstorentq.128B
+ __builtin_HEXAGON_V6_vmaskedstorentq_128B(v128, 0, v128);
+ // CHECK: @llvm.hexagon.V6.vmaskedstoreq.128B
+ __builtin_HEXAGON_V6_vmaskedstoreq_128B(v128, 0, v128);
+ // CHECK: @llvm.hexagon.V6.vmaxb.128B
+ __builtin_HEXAGON_V6_vmaxb_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmaxh.128B
+ __builtin_HEXAGON_V6_vmaxh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmaxub.128B
+ __builtin_HEXAGON_V6_vmaxub_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmaxuh.128B
+ __builtin_HEXAGON_V6_vmaxuh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmaxw.128B
+ __builtin_HEXAGON_V6_vmaxw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vminb.128B
+ __builtin_HEXAGON_V6_vminb_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vminh.128B
+ __builtin_HEXAGON_V6_vminh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vminub.128B
+ __builtin_HEXAGON_V6_vminub_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vminuh.128B
+ __builtin_HEXAGON_V6_vminuh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vminw.128B
+ __builtin_HEXAGON_V6_vminw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpabus.128B
+ __builtin_HEXAGON_V6_vmpabus_128B(v256, 0);
+ // CHECK: @llvm.hexagon.V6.vmpabus.acc.128B
+ __builtin_HEXAGON_V6_vmpabus_acc_128B(v256, v256, 0);
+ // CHECK: @llvm.hexagon.V6.vmpabusv.128B
+ __builtin_HEXAGON_V6_vmpabusv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vmpabuu.128B
+ __builtin_HEXAGON_V6_vmpabuu_128B(v256, 0);
+ // CHECK: @llvm.hexagon.V6.vmpabuu.acc.128B
+ __builtin_HEXAGON_V6_vmpabuu_acc_128B(v256, v256, 0);
+ // CHECK: @llvm.hexagon.V6.vmpabuuv.128B
+ __builtin_HEXAGON_V6_vmpabuuv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vmpahb.128B
+ __builtin_HEXAGON_V6_vmpahb_128B(v256, 0);
+ // CHECK: @llvm.hexagon.V6.vmpahb.acc.128B
+ __builtin_HEXAGON_V6_vmpahb_acc_128B(v256, v256, 0);
+ // CHECK: @llvm.hexagon.V6.vmpahhsat.128B
+ __builtin_HEXAGON_V6_vmpahhsat_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpauhb.128B
+ __builtin_HEXAGON_V6_vmpauhb_128B(v256, 0);
+ // CHECK: @llvm.hexagon.V6.vmpauhb.acc.128B
+ __builtin_HEXAGON_V6_vmpauhb_acc_128B(v256, v256, 0);
+ // CHECK: @llvm.hexagon.V6.vmpauhuhsat.128B
+ __builtin_HEXAGON_V6_vmpauhuhsat_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpsuhuhsat.128B
+ __builtin_HEXAGON_V6_vmpsuhuhsat_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpybus.128B
+ __builtin_HEXAGON_V6_vmpybus_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpybus.acc.128B
+ __builtin_HEXAGON_V6_vmpybus_acc_128B(v256, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpybusv.128B
+ __builtin_HEXAGON_V6_vmpybusv_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpybusv.acc.128B
+ __builtin_HEXAGON_V6_vmpybusv_acc_128B(v256, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpybv.128B
+ __builtin_HEXAGON_V6_vmpybv_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpybv.acc.128B
+ __builtin_HEXAGON_V6_vmpybv_acc_128B(v256, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyewuh.128B
+ __builtin_HEXAGON_V6_vmpyewuh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyewuh.64.128B
+ __builtin_HEXAGON_V6_vmpyewuh_64_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyh.128B
+ __builtin_HEXAGON_V6_vmpyh_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyh.acc.128B
+ __builtin_HEXAGON_V6_vmpyh_acc_128B(v256, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyhsat.acc.128B
+ __builtin_HEXAGON_V6_vmpyhsat_acc_128B(v256, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyhsrs.128B
+ __builtin_HEXAGON_V6_vmpyhsrs_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyhss.128B
+ __builtin_HEXAGON_V6_vmpyhss_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyhus.128B
+ __builtin_HEXAGON_V6_vmpyhus_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyhus.acc.128B
+ __builtin_HEXAGON_V6_vmpyhus_acc_128B(v256, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyhv.128B
+ __builtin_HEXAGON_V6_vmpyhv_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyhv.acc.128B
+ __builtin_HEXAGON_V6_vmpyhv_acc_128B(v256, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyhvsrs.128B
+ __builtin_HEXAGON_V6_vmpyhvsrs_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyieoh.128B
+ __builtin_HEXAGON_V6_vmpyieoh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyiewh.acc.128B
+ __builtin_HEXAGON_V6_vmpyiewh_acc_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyiewuh.128B
+ __builtin_HEXAGON_V6_vmpyiewuh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyiewuh.acc.128B
+ __builtin_HEXAGON_V6_vmpyiewuh_acc_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyih.128B
+ __builtin_HEXAGON_V6_vmpyih_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyih.acc.128B
+ __builtin_HEXAGON_V6_vmpyih_acc_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyihb.128B
+ __builtin_HEXAGON_V6_vmpyihb_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyihb.acc.128B
+ __builtin_HEXAGON_V6_vmpyihb_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyiowh.128B
+ __builtin_HEXAGON_V6_vmpyiowh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyiwb.128B
+ __builtin_HEXAGON_V6_vmpyiwb_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyiwb.acc.128B
+ __builtin_HEXAGON_V6_vmpyiwb_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyiwh.128B
+ __builtin_HEXAGON_V6_vmpyiwh_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyiwh.acc.128B
+ __builtin_HEXAGON_V6_vmpyiwh_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyiwub.128B
+ __builtin_HEXAGON_V6_vmpyiwub_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyiwub.acc.128B
+ __builtin_HEXAGON_V6_vmpyiwub_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyowh.128B
+ __builtin_HEXAGON_V6_vmpyowh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyowh.64.acc.128B
+ __builtin_HEXAGON_V6_vmpyowh_64_acc_128B(v256, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyowh.rnd.128B
+ __builtin_HEXAGON_V6_vmpyowh_rnd_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyowh.rnd.sacc.128B
+ __builtin_HEXAGON_V6_vmpyowh_rnd_sacc_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyowh.sacc.128B
+ __builtin_HEXAGON_V6_vmpyowh_sacc_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyub.128B
+ __builtin_HEXAGON_V6_vmpyub_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyub.acc.128B
+ __builtin_HEXAGON_V6_vmpyub_acc_128B(v256, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyubv.128B
+ __builtin_HEXAGON_V6_vmpyubv_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyubv.acc.128B
+ __builtin_HEXAGON_V6_vmpyubv_acc_128B(v256, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyuh.128B
+ __builtin_HEXAGON_V6_vmpyuh_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyuh.acc.128B
+ __builtin_HEXAGON_V6_vmpyuh_acc_128B(v256, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyuhe.128B
+ __builtin_HEXAGON_V6_vmpyuhe_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyuhe.acc.128B
+ __builtin_HEXAGON_V6_vmpyuhe_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyuhv.128B
+ __builtin_HEXAGON_V6_vmpyuhv_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpyuhv.acc.128B
+ __builtin_HEXAGON_V6_vmpyuhv_acc_128B(v256, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmux.128B
+ __builtin_HEXAGON_V6_vmux_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vnavgb.128B
+ __builtin_HEXAGON_V6_vnavgb_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vnavgh.128B
+ __builtin_HEXAGON_V6_vnavgh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vnavgub.128B
+ __builtin_HEXAGON_V6_vnavgub_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vnavgw.128B
+ __builtin_HEXAGON_V6_vnavgw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vnormamth.128B
+ __builtin_HEXAGON_V6_vnormamth_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vnormamtw.128B
+ __builtin_HEXAGON_V6_vnormamtw_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vnot.128B
+ __builtin_HEXAGON_V6_vnot_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vor.128B
+ __builtin_HEXAGON_V6_vor_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vpackeb.128B
+ __builtin_HEXAGON_V6_vpackeb_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vpackeh.128B
+ __builtin_HEXAGON_V6_vpackeh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vpackhb.sat.128B
+ __builtin_HEXAGON_V6_vpackhb_sat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vpackhub.sat.128B
+ __builtin_HEXAGON_V6_vpackhub_sat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vpackob.128B
+ __builtin_HEXAGON_V6_vpackob_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vpackoh.128B
+ __builtin_HEXAGON_V6_vpackoh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vpackwh.sat.128B
+ __builtin_HEXAGON_V6_vpackwh_sat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vpackwuh.sat.128B
+ __builtin_HEXAGON_V6_vpackwuh_sat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vpopcounth.128B
+ __builtin_HEXAGON_V6_vpopcounth_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vprefixqb.128B
+ __builtin_HEXAGON_V6_vprefixqb_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vprefixqh.128B
+ __builtin_HEXAGON_V6_vprefixqh_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vprefixqw.128B
+ __builtin_HEXAGON_V6_vprefixqw_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vrdelta.128B
+ __builtin_HEXAGON_V6_vrdelta_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vrmpybub.rtt.128B
+ __builtin_HEXAGON_V6_vrmpybub_rtt_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpybub.rtt.acc.128B
+ __builtin_HEXAGON_V6_vrmpybub_rtt_acc_128B(v256, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpybus.128B
+ __builtin_HEXAGON_V6_vrmpybus_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpybus.acc.128B
+ __builtin_HEXAGON_V6_vrmpybus_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpybusi.128B
+ __builtin_HEXAGON_V6_vrmpybusi_128B(v256, 0, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpybusi.acc.128B
+ __builtin_HEXAGON_V6_vrmpybusi_acc_128B(v256, v256, 0, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpybusv.128B
+ __builtin_HEXAGON_V6_vrmpybusv_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vrmpybusv.acc.128B
+ __builtin_HEXAGON_V6_vrmpybusv_acc_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vrmpybv.128B
+ __builtin_HEXAGON_V6_vrmpybv_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vrmpybv.acc.128B
+ __builtin_HEXAGON_V6_vrmpybv_acc_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vrmpyub.128B
+ __builtin_HEXAGON_V6_vrmpyub_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpyub.acc.128B
+ __builtin_HEXAGON_V6_vrmpyub_acc_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpyub.rtt.128B
+ __builtin_HEXAGON_V6_vrmpyub_rtt_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpyub.rtt.acc.128B
+ __builtin_HEXAGON_V6_vrmpyub_rtt_acc_128B(v256, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpyubi.128B
+ __builtin_HEXAGON_V6_vrmpyubi_128B(v256, 0, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpyubi.acc.128B
+ __builtin_HEXAGON_V6_vrmpyubi_acc_128B(v256, v256, 0, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpyubv.128B
+ __builtin_HEXAGON_V6_vrmpyubv_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vrmpyubv.acc.128B
+ __builtin_HEXAGON_V6_vrmpyubv_acc_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vror.128B
+ __builtin_HEXAGON_V6_vror_128B(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vroundhb.128B
+ __builtin_HEXAGON_V6_vroundhb_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vroundhub.128B
+ __builtin_HEXAGON_V6_vroundhub_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vrounduhub.128B
+ __builtin_HEXAGON_V6_vrounduhub_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vrounduwuh.128B
+ __builtin_HEXAGON_V6_vrounduwuh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vroundwh.128B
+ __builtin_HEXAGON_V6_vroundwh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vroundwuh.128B
+ __builtin_HEXAGON_V6_vroundwuh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vrsadubi.128B
+ __builtin_HEXAGON_V6_vrsadubi_128B(v256, 0, 0);
+ // CHECK: @llvm.hexagon.V6.vrsadubi.acc.128B
+ __builtin_HEXAGON_V6_vrsadubi_acc_128B(v256, v256, 0, 0);
+ // CHECK: @llvm.hexagon.V6.vsathub.128B
+ __builtin_HEXAGON_V6_vsathub_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsatuwuh.128B
+ __builtin_HEXAGON_V6_vsatuwuh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsatwh.128B
+ __builtin_HEXAGON_V6_vsatwh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsb.128B
+ __builtin_HEXAGON_V6_vsb_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vscattermh.128B
+ __builtin_HEXAGON_V6_vscattermh_128B(0, 0, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vscattermh.add.128B
+ __builtin_HEXAGON_V6_vscattermh_add_128B(0, 0, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vscattermhq.128B
+ __builtin_HEXAGON_V6_vscattermhq_128B(v128, 0, 0, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vscattermhw.128B
+ __builtin_HEXAGON_V6_vscattermhw_128B(0, 0, v256, v128);
+ // CHECK: @llvm.hexagon.V6.vscattermhw.add.128B
+ __builtin_HEXAGON_V6_vscattermhw_add_128B(0, 0, v256, v128);
+ // CHECK: @llvm.hexagon.V6.vscattermhwq.128B
+ __builtin_HEXAGON_V6_vscattermhwq_128B(v128, 0, 0, v256, v128);
+ // CHECK: @llvm.hexagon.V6.vscattermw.128B
+ __builtin_HEXAGON_V6_vscattermw_128B(0, 0, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vscattermw.add.128B
+ __builtin_HEXAGON_V6_vscattermw_add_128B(0, 0, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vscattermwq.128B
+ __builtin_HEXAGON_V6_vscattermwq_128B(v128, 0, 0, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsh.128B
+ __builtin_HEXAGON_V6_vsh_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vshufeh.128B
+ __builtin_HEXAGON_V6_vshufeh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vshuffb.128B
+ __builtin_HEXAGON_V6_vshuffb_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vshuffeb.128B
+ __builtin_HEXAGON_V6_vshuffeb_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vshuffh.128B
+ __builtin_HEXAGON_V6_vshuffh_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vshuffob.128B
+ __builtin_HEXAGON_V6_vshuffob_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vshuffvdd.128B
+ __builtin_HEXAGON_V6_vshuffvdd_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vshufoeb.128B
+ __builtin_HEXAGON_V6_vshufoeb_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vshufoeh.128B
+ __builtin_HEXAGON_V6_vshufoeh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vshufoh.128B
+ __builtin_HEXAGON_V6_vshufoh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubb.128B
+ __builtin_HEXAGON_V6_vsubb_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubb.dv.128B
+ __builtin_HEXAGON_V6_vsubb_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vsubbnq.128B
+ __builtin_HEXAGON_V6_vsubbnq_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubbq.128B
+ __builtin_HEXAGON_V6_vsubbq_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubbsat.128B
+ __builtin_HEXAGON_V6_vsubbsat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubbsat.dv.128B
+ __builtin_HEXAGON_V6_vsubbsat_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vsubcarry.128B
+ __builtin_HEXAGON_V6_vsubcarry_128B(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vsubh.128B
+ __builtin_HEXAGON_V6_vsubh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubh.dv.128B
+ __builtin_HEXAGON_V6_vsubh_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vsubhnq.128B
+ __builtin_HEXAGON_V6_vsubhnq_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubhq.128B
+ __builtin_HEXAGON_V6_vsubhq_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubhsat.128B
+ __builtin_HEXAGON_V6_vsubhsat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubhsat.dv.128B
+ __builtin_HEXAGON_V6_vsubhsat_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vsubhw.128B
+ __builtin_HEXAGON_V6_vsubhw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsububh.128B
+ __builtin_HEXAGON_V6_vsububh_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsububsat.128B
+ __builtin_HEXAGON_V6_vsububsat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsububsat.dv.128B
+ __builtin_HEXAGON_V6_vsububsat_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vsubububb.sat.128B
+ __builtin_HEXAGON_V6_vsubububb_sat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubuhsat.128B
+ __builtin_HEXAGON_V6_vsubuhsat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubuhsat.dv.128B
+ __builtin_HEXAGON_V6_vsubuhsat_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vsubuhw.128B
+ __builtin_HEXAGON_V6_vsubuhw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubuwsat.128B
+ __builtin_HEXAGON_V6_vsubuwsat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubuwsat.dv.128B
+ __builtin_HEXAGON_V6_vsubuwsat_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vsubw.128B
+ __builtin_HEXAGON_V6_vsubw_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubw.dv.128B
+ __builtin_HEXAGON_V6_vsubw_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vsubwnq.128B
+ __builtin_HEXAGON_V6_vsubwnq_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubwq.128B
+ __builtin_HEXAGON_V6_vsubwq_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubwsat.128B
+ __builtin_HEXAGON_V6_vsubwsat_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubwsat.dv.128B
+ __builtin_HEXAGON_V6_vsubwsat_dv_128B(v256, v256);
+ // CHECK: @llvm.hexagon.V6.vswap.128B
+ __builtin_HEXAGON_V6_vswap_128B(v128, v128, v128);
+ // CHECK: @llvm.hexagon.V6.vtmpyb.128B
+ __builtin_HEXAGON_V6_vtmpyb_128B(v256, 0);
+ // CHECK: @llvm.hexagon.V6.vtmpyb.acc.128B
+ __builtin_HEXAGON_V6_vtmpyb_acc_128B(v256, v256, 0);
+ // CHECK: @llvm.hexagon.V6.vtmpybus.128B
+ __builtin_HEXAGON_V6_vtmpybus_128B(v256, 0);
+ // CHECK: @llvm.hexagon.V6.vtmpybus.acc.128B
+ __builtin_HEXAGON_V6_vtmpybus_acc_128B(v256, v256, 0);
+ // CHECK: @llvm.hexagon.V6.vtmpyhb.128B
+ __builtin_HEXAGON_V6_vtmpyhb_128B(v256, 0);
+ // CHECK: @llvm.hexagon.V6.vtmpyhb.acc.128B
+ __builtin_HEXAGON_V6_vtmpyhb_acc_128B(v256, v256, 0);
+ // CHECK: @llvm.hexagon.V6.vunpackb.128B
+ __builtin_HEXAGON_V6_vunpackb_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vunpackh.128B
+ __builtin_HEXAGON_V6_vunpackh_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vunpackob.128B
+ __builtin_HEXAGON_V6_vunpackob_128B(v256, v128);
+ // CHECK: @llvm.hexagon.V6.vunpackoh.128B
+ __builtin_HEXAGON_V6_vunpackoh_128B(v256, v128);
+ // CHECK: @llvm.hexagon.V6.vunpackub.128B
+ __builtin_HEXAGON_V6_vunpackub_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vunpackuh.128B
+ __builtin_HEXAGON_V6_vunpackuh_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vxor.128B
+ __builtin_HEXAGON_V6_vxor_128B(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vzb.128B
+ __builtin_HEXAGON_V6_vzb_128B(v128);
+ // CHECK: @llvm.hexagon.V6.vzh.128B
+ __builtin_HEXAGON_V6_vzh_128B(v128);
+}
diff --git a/test/CodeGen/builtins-hvx64.c b/test/CodeGen/builtins-hvx64.c
new file mode 100644
index 000000000000..5a53296e7276
--- /dev/null
+++ b/test/CodeGen/builtins-hvx64.c
@@ -0,0 +1,802 @@
+// REQUIRES: hexagon-registered-target
+// RUN: %clang_cc1 -triple hexagon-unknown-elf -target-cpu hexagonv65 -target-feature +hvxv65 -target-feature +hvx-length64b -emit-llvm %s -o - | FileCheck %s
+
+void test() {
+ int v64 __attribute__((__vector_size__(64)));
+ int v128 __attribute__((__vector_size__(128)));
+
+ // CHECK: @llvm.hexagon.V6.extractw
+ __builtin_HEXAGON_V6_extractw(v64, 0);
+ // CHECK: @llvm.hexagon.V6.hi
+ __builtin_HEXAGON_V6_hi(v128);
+ // CHECK: @llvm.hexagon.V6.lo
+ __builtin_HEXAGON_V6_lo(v128);
+ // CHECK: @llvm.hexagon.V6.lvsplatb
+ __builtin_HEXAGON_V6_lvsplatb(0);
+ // CHECK: @llvm.hexagon.V6.lvsplath
+ __builtin_HEXAGON_V6_lvsplath(0);
+ // CHECK: @llvm.hexagon.V6.lvsplatw
+ __builtin_HEXAGON_V6_lvsplatw(0);
+ // CHECK: @llvm.hexagon.V6.pred.and
+ __builtin_HEXAGON_V6_pred_and(v64, v64);
+ // CHECK: @llvm.hexagon.V6.pred.and.n
+ __builtin_HEXAGON_V6_pred_and_n(v64, v64);
+ // CHECK: @llvm.hexagon.V6.pred.not
+ __builtin_HEXAGON_V6_pred_not(v64);
+ // CHECK: @llvm.hexagon.V6.pred.or
+ __builtin_HEXAGON_V6_pred_or(v64, v64);
+ // CHECK: @llvm.hexagon.V6.pred.or.n
+ __builtin_HEXAGON_V6_pred_or_n(v64, v64);
+ // CHECK: @llvm.hexagon.V6.pred.scalar2
+ __builtin_HEXAGON_V6_pred_scalar2(0);
+ // CHECK: @llvm.hexagon.V6.pred.scalar2v2
+ __builtin_HEXAGON_V6_pred_scalar2v2(0);
+ // CHECK: @llvm.hexagon.V6.pred.xor
+ __builtin_HEXAGON_V6_pred_xor(v64, v64);
+ // CHECK: @llvm.hexagon.V6.shuffeqh
+ __builtin_HEXAGON_V6_shuffeqh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.shuffeqw
+ __builtin_HEXAGON_V6_shuffeqw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vS32b.nqpred.ai
+ __builtin_HEXAGON_V6_vS32b_nqpred_ai(v64, 0, v64);
+ // CHECK: @llvm.hexagon.V6.vS32b.nt.nqpred.ai
+ __builtin_HEXAGON_V6_vS32b_nt_nqpred_ai(v64, 0, v64);
+ // CHECK: @llvm.hexagon.V6.vS32b.nt.qpred.ai
+ __builtin_HEXAGON_V6_vS32b_nt_qpred_ai(v64, 0, v64);
+ // CHECK: @llvm.hexagon.V6.vS32b.qpred.ai
+ __builtin_HEXAGON_V6_vS32b_qpred_ai(v64, 0, v64);
+ // CHECK: @llvm.hexagon.V6.vabsb
+ __builtin_HEXAGON_V6_vabsb(v64);
+ // CHECK: @llvm.hexagon.V6.vabsb.sat
+ __builtin_HEXAGON_V6_vabsb_sat(v64);
+ // CHECK: @llvm.hexagon.V6.vabsdiffh
+ __builtin_HEXAGON_V6_vabsdiffh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vabsdiffub
+ __builtin_HEXAGON_V6_vabsdiffub(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vabsdiffuh
+ __builtin_HEXAGON_V6_vabsdiffuh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vabsdiffw
+ __builtin_HEXAGON_V6_vabsdiffw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vabsh
+ __builtin_HEXAGON_V6_vabsh(v64);
+ // CHECK: @llvm.hexagon.V6.vabsh.sat
+ __builtin_HEXAGON_V6_vabsh_sat(v64);
+ // CHECK: @llvm.hexagon.V6.vabsw
+ __builtin_HEXAGON_V6_vabsw(v64);
+ // CHECK: @llvm.hexagon.V6.vabsw.sat
+ __builtin_HEXAGON_V6_vabsw_sat(v64);
+ // CHECK: @llvm.hexagon.V6.vaddb
+ __builtin_HEXAGON_V6_vaddb(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddb.dv
+ __builtin_HEXAGON_V6_vaddb_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddbnq
+ __builtin_HEXAGON_V6_vaddbnq(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddbq
+ __builtin_HEXAGON_V6_vaddbq(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddbsat
+ __builtin_HEXAGON_V6_vaddbsat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddbsat.dv
+ __builtin_HEXAGON_V6_vaddbsat_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddcarry
+ __builtin_HEXAGON_V6_vaddcarry(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vaddclbh
+ __builtin_HEXAGON_V6_vaddclbh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddclbw
+ __builtin_HEXAGON_V6_vaddclbw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddh
+ __builtin_HEXAGON_V6_vaddh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddh.dv
+ __builtin_HEXAGON_V6_vaddh_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddhnq
+ __builtin_HEXAGON_V6_vaddhnq(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddhq
+ __builtin_HEXAGON_V6_vaddhq(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddhsat
+ __builtin_HEXAGON_V6_vaddhsat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddhsat.dv
+ __builtin_HEXAGON_V6_vaddhsat_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddhw
+ __builtin_HEXAGON_V6_vaddhw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddhw.acc
+ __builtin_HEXAGON_V6_vaddhw_acc(v128, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddubh
+ __builtin_HEXAGON_V6_vaddubh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddubh.acc
+ __builtin_HEXAGON_V6_vaddubh_acc(v128, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddubsat
+ __builtin_HEXAGON_V6_vaddubsat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddubsat.dv
+ __builtin_HEXAGON_V6_vaddubsat_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddububb.sat
+ __builtin_HEXAGON_V6_vaddububb_sat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vadduhsat
+ __builtin_HEXAGON_V6_vadduhsat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vadduhsat.dv
+ __builtin_HEXAGON_V6_vadduhsat_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vadduhw
+ __builtin_HEXAGON_V6_vadduhw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vadduhw.acc
+ __builtin_HEXAGON_V6_vadduhw_acc(v128, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vadduwsat
+ __builtin_HEXAGON_V6_vadduwsat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vadduwsat.dv
+ __builtin_HEXAGON_V6_vadduwsat_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddw
+ __builtin_HEXAGON_V6_vaddw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddw.dv
+ __builtin_HEXAGON_V6_vaddw_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vaddwnq
+ __builtin_HEXAGON_V6_vaddwnq(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddwq
+ __builtin_HEXAGON_V6_vaddwq(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddwsat
+ __builtin_HEXAGON_V6_vaddwsat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaddwsat.dv
+ __builtin_HEXAGON_V6_vaddwsat_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.valignb
+ __builtin_HEXAGON_V6_valignb(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.valignbi
+ __builtin_HEXAGON_V6_valignbi(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vand
+ __builtin_HEXAGON_V6_vand(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vandnqrt
+ __builtin_HEXAGON_V6_vandnqrt(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vandnqrt.acc
+ __builtin_HEXAGON_V6_vandnqrt_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vandqrt
+ __builtin_HEXAGON_V6_vandqrt(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vandqrt.acc
+ __builtin_HEXAGON_V6_vandqrt_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vandvnqv
+ __builtin_HEXAGON_V6_vandvnqv(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vandvqv
+ __builtin_HEXAGON_V6_vandvqv(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vandvrt
+ __builtin_HEXAGON_V6_vandvrt(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vandvrt.acc
+ __builtin_HEXAGON_V6_vandvrt_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vaslh
+ __builtin_HEXAGON_V6_vaslh(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vaslh.acc
+ __builtin_HEXAGON_V6_vaslh_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vaslhv
+ __builtin_HEXAGON_V6_vaslhv(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vaslw
+ __builtin_HEXAGON_V6_vaslw(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vaslw.acc
+ __builtin_HEXAGON_V6_vaslw_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vaslwv
+ __builtin_HEXAGON_V6_vaslwv(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vasrh
+ __builtin_HEXAGON_V6_vasrh(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasrh.acc
+ __builtin_HEXAGON_V6_vasrh_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasrhbrndsat
+ __builtin_HEXAGON_V6_vasrhbrndsat(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasrhbsat
+ __builtin_HEXAGON_V6_vasrhbsat(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasrhubrndsat
+ __builtin_HEXAGON_V6_vasrhubrndsat(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasrhubsat
+ __builtin_HEXAGON_V6_vasrhubsat(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasrhv
+ __builtin_HEXAGON_V6_vasrhv(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vasruhubrndsat
+ __builtin_HEXAGON_V6_vasruhubrndsat(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasruhubsat
+ __builtin_HEXAGON_V6_vasruhubsat(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasruwuhrndsat
+ __builtin_HEXAGON_V6_vasruwuhrndsat(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasruwuhsat
+ __builtin_HEXAGON_V6_vasruwuhsat(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasrw
+ __builtin_HEXAGON_V6_vasrw(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasrw.acc
+ __builtin_HEXAGON_V6_vasrw_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasrwh
+ __builtin_HEXAGON_V6_vasrwh(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasrwhrndsat
+ __builtin_HEXAGON_V6_vasrwhrndsat(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasrwhsat
+ __builtin_HEXAGON_V6_vasrwhsat(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasrwuhrndsat
+ __builtin_HEXAGON_V6_vasrwuhrndsat(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasrwuhsat
+ __builtin_HEXAGON_V6_vasrwuhsat(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vasrwv
+ __builtin_HEXAGON_V6_vasrwv(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vassign
+ __builtin_HEXAGON_V6_vassign(v64);
+ // CHECK: @llvm.hexagon.V6.vassignp
+ __builtin_HEXAGON_V6_vassignp(v128);
+ // CHECK: @llvm.hexagon.V6.vavgb
+ __builtin_HEXAGON_V6_vavgb(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vavgbrnd
+ __builtin_HEXAGON_V6_vavgbrnd(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vavgh
+ __builtin_HEXAGON_V6_vavgh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vavghrnd
+ __builtin_HEXAGON_V6_vavghrnd(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vavgub
+ __builtin_HEXAGON_V6_vavgub(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vavgubrnd
+ __builtin_HEXAGON_V6_vavgubrnd(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vavguh
+ __builtin_HEXAGON_V6_vavguh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vavguhrnd
+ __builtin_HEXAGON_V6_vavguhrnd(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vavguw
+ __builtin_HEXAGON_V6_vavguw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vavguwrnd
+ __builtin_HEXAGON_V6_vavguwrnd(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vavgw
+ __builtin_HEXAGON_V6_vavgw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vavgwrnd
+ __builtin_HEXAGON_V6_vavgwrnd(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vcl0h
+ __builtin_HEXAGON_V6_vcl0h(v64);
+ // CHECK: @llvm.hexagon.V6.vcl0w
+ __builtin_HEXAGON_V6_vcl0w(v64);
+ // CHECK: @llvm.hexagon.V6.vcombine
+ __builtin_HEXAGON_V6_vcombine(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vd0
+ __builtin_HEXAGON_V6_vd0();
+ // CHECK: @llvm.hexagon.V6.vdd0
+ __builtin_HEXAGON_V6_vdd0();
+ // CHECK: @llvm.hexagon.V6.vdealb
+ __builtin_HEXAGON_V6_vdealb(v64);
+ // CHECK: @llvm.hexagon.V6.vdealb4w
+ __builtin_HEXAGON_V6_vdealb4w(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vdealh
+ __builtin_HEXAGON_V6_vdealh(v64);
+ // CHECK: @llvm.hexagon.V6.vdealvdd
+ __builtin_HEXAGON_V6_vdealvdd(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vdelta
+ __builtin_HEXAGON_V6_vdelta(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vdmpybus
+ __builtin_HEXAGON_V6_vdmpybus(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpybus.acc
+ __builtin_HEXAGON_V6_vdmpybus_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpybus.dv
+ __builtin_HEXAGON_V6_vdmpybus_dv(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpybus.dv.acc
+ __builtin_HEXAGON_V6_vdmpybus_dv_acc(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhb
+ __builtin_HEXAGON_V6_vdmpyhb(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhb.acc
+ __builtin_HEXAGON_V6_vdmpyhb_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhb.dv
+ __builtin_HEXAGON_V6_vdmpyhb_dv(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhb.dv.acc
+ __builtin_HEXAGON_V6_vdmpyhb_dv_acc(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhisat
+ __builtin_HEXAGON_V6_vdmpyhisat(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhisat.acc
+ __builtin_HEXAGON_V6_vdmpyhisat_acc(v64, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhsat
+ __builtin_HEXAGON_V6_vdmpyhsat(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhsat.acc
+ __builtin_HEXAGON_V6_vdmpyhsat_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhsuisat
+ __builtin_HEXAGON_V6_vdmpyhsuisat(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhsuisat.acc
+ __builtin_HEXAGON_V6_vdmpyhsuisat_acc(v64, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhsusat
+ __builtin_HEXAGON_V6_vdmpyhsusat(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhsusat.acc
+ __builtin_HEXAGON_V6_vdmpyhsusat_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vdmpyhvsat
+ __builtin_HEXAGON_V6_vdmpyhvsat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vdmpyhvsat.acc
+ __builtin_HEXAGON_V6_vdmpyhvsat_acc(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vdsaduh
+ __builtin_HEXAGON_V6_vdsaduh(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vdsaduh.acc
+ __builtin_HEXAGON_V6_vdsaduh_acc(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.veqb
+ __builtin_HEXAGON_V6_veqb(v64, v64);
+ // CHECK: @llvm.hexagon.V6.veqb.and
+ __builtin_HEXAGON_V6_veqb_and(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.veqb.or
+ __builtin_HEXAGON_V6_veqb_or(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.veqb.xor
+ __builtin_HEXAGON_V6_veqb_xor(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.veqh
+ __builtin_HEXAGON_V6_veqh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.veqh.and
+ __builtin_HEXAGON_V6_veqh_and(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.veqh.or
+ __builtin_HEXAGON_V6_veqh_or(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.veqh.xor
+ __builtin_HEXAGON_V6_veqh_xor(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.veqw
+ __builtin_HEXAGON_V6_veqw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.veqw.and
+ __builtin_HEXAGON_V6_veqw_and(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.veqw.or
+ __builtin_HEXAGON_V6_veqw_or(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.veqw.xor
+ __builtin_HEXAGON_V6_veqw_xor(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgathermh
+ __builtin_HEXAGON_V6_vgathermh(0, 0, 0, v64);
+ // CHECK: @llvm.hexagon.V6.vgathermhq
+ __builtin_HEXAGON_V6_vgathermhq(0, v64, 0, 0, v64);
+ // CHECK: @llvm.hexagon.V6.vgathermhw
+ __builtin_HEXAGON_V6_vgathermhw(0, 0, 0, v128);
+ // CHECK: @llvm.hexagon.V6.vgathermhwq
+ __builtin_HEXAGON_V6_vgathermhwq(0, v64, 0, 0, v128);
+ // CHECK: @llvm.hexagon.V6.vgathermw
+ __builtin_HEXAGON_V6_vgathermw(0, 0, 0, v64);
+ // CHECK: @llvm.hexagon.V6.vgathermwq
+ __builtin_HEXAGON_V6_vgathermwq(0, v64, 0, 0, v64);
+ // CHECK: @llvm.hexagon.V6.vgtb
+ __builtin_HEXAGON_V6_vgtb(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtb.and
+ __builtin_HEXAGON_V6_vgtb_and(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtb.or
+ __builtin_HEXAGON_V6_vgtb_or(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtb.xor
+ __builtin_HEXAGON_V6_vgtb_xor(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgth
+ __builtin_HEXAGON_V6_vgth(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgth.and
+ __builtin_HEXAGON_V6_vgth_and(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgth.or
+ __builtin_HEXAGON_V6_vgth_or(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgth.xor
+ __builtin_HEXAGON_V6_vgth_xor(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtub
+ __builtin_HEXAGON_V6_vgtub(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtub.and
+ __builtin_HEXAGON_V6_vgtub_and(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtub.or
+ __builtin_HEXAGON_V6_vgtub_or(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtub.xor
+ __builtin_HEXAGON_V6_vgtub_xor(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtuh
+ __builtin_HEXAGON_V6_vgtuh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtuh.and
+ __builtin_HEXAGON_V6_vgtuh_and(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtuh.or
+ __builtin_HEXAGON_V6_vgtuh_or(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtuh.xor
+ __builtin_HEXAGON_V6_vgtuh_xor(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtuw
+ __builtin_HEXAGON_V6_vgtuw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtuw.and
+ __builtin_HEXAGON_V6_vgtuw_and(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtuw.or
+ __builtin_HEXAGON_V6_vgtuw_or(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtuw.xor
+ __builtin_HEXAGON_V6_vgtuw_xor(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtw
+ __builtin_HEXAGON_V6_vgtw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtw.and
+ __builtin_HEXAGON_V6_vgtw_and(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtw.or
+ __builtin_HEXAGON_V6_vgtw_or(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vgtw.xor
+ __builtin_HEXAGON_V6_vgtw_xor(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vinsertwr
+ __builtin_HEXAGON_V6_vinsertwr(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vlalignb
+ __builtin_HEXAGON_V6_vlalignb(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vlalignbi
+ __builtin_HEXAGON_V6_vlalignbi(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vlsrb
+ __builtin_HEXAGON_V6_vlsrb(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vlsrh
+ __builtin_HEXAGON_V6_vlsrh(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vlsrhv
+ __builtin_HEXAGON_V6_vlsrhv(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vlsrw
+ __builtin_HEXAGON_V6_vlsrw(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vlsrwv
+ __builtin_HEXAGON_V6_vlsrwv(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vlut4
+ __builtin_HEXAGON_V6_vlut4(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvvb
+ __builtin_HEXAGON_V6_vlutvvb(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvvb.nm
+ __builtin_HEXAGON_V6_vlutvvb_nm(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvvb.oracc
+ __builtin_HEXAGON_V6_vlutvvb_oracc(v64, v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvvb.oracci
+ __builtin_HEXAGON_V6_vlutvvb_oracci(v64, v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvvbi
+ __builtin_HEXAGON_V6_vlutvvbi(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvwh
+ __builtin_HEXAGON_V6_vlutvwh(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvwh.nm
+ __builtin_HEXAGON_V6_vlutvwh_nm(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvwh.oracc
+ __builtin_HEXAGON_V6_vlutvwh_oracc(v128, v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvwh.oracci
+ __builtin_HEXAGON_V6_vlutvwh_oracci(v128, v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vlutvwhi
+ __builtin_HEXAGON_V6_vlutvwhi(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmaskedstorenq
+ __builtin_HEXAGON_V6_vmaskedstorenq(v64, 0, v64);
+ // CHECK: @llvm.hexagon.V6.vmaskedstorentnq
+ __builtin_HEXAGON_V6_vmaskedstorentnq(v64, 0, v64);
+ // CHECK: @llvm.hexagon.V6.vmaskedstorentq
+ __builtin_HEXAGON_V6_vmaskedstorentq(v64, 0, v64);
+ // CHECK: @llvm.hexagon.V6.vmaskedstoreq
+ __builtin_HEXAGON_V6_vmaskedstoreq(v64, 0, v64);
+ // CHECK: @llvm.hexagon.V6.vmaxb
+ __builtin_HEXAGON_V6_vmaxb(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmaxh
+ __builtin_HEXAGON_V6_vmaxh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmaxub
+ __builtin_HEXAGON_V6_vmaxub(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmaxuh
+ __builtin_HEXAGON_V6_vmaxuh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmaxw
+ __builtin_HEXAGON_V6_vmaxw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vminb
+ __builtin_HEXAGON_V6_vminb(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vminh
+ __builtin_HEXAGON_V6_vminh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vminub
+ __builtin_HEXAGON_V6_vminub(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vminuh
+ __builtin_HEXAGON_V6_vminuh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vminw
+ __builtin_HEXAGON_V6_vminw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpabus
+ __builtin_HEXAGON_V6_vmpabus(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpabus.acc
+ __builtin_HEXAGON_V6_vmpabus_acc(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpabusv
+ __builtin_HEXAGON_V6_vmpabusv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpabuu
+ __builtin_HEXAGON_V6_vmpabuu(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpabuu.acc
+ __builtin_HEXAGON_V6_vmpabuu_acc(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpabuuv
+ __builtin_HEXAGON_V6_vmpabuuv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vmpahb
+ __builtin_HEXAGON_V6_vmpahb(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpahb.acc
+ __builtin_HEXAGON_V6_vmpahb_acc(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpahhsat
+ __builtin_HEXAGON_V6_vmpahhsat(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpauhb
+ __builtin_HEXAGON_V6_vmpauhb(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpauhb.acc
+ __builtin_HEXAGON_V6_vmpauhb_acc(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vmpauhuhsat
+ __builtin_HEXAGON_V6_vmpauhuhsat(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpsuhuhsat
+ __builtin_HEXAGON_V6_vmpsuhuhsat(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpybus
+ __builtin_HEXAGON_V6_vmpybus(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpybus.acc
+ __builtin_HEXAGON_V6_vmpybus_acc(v128, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpybusv
+ __builtin_HEXAGON_V6_vmpybusv(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpybusv.acc
+ __builtin_HEXAGON_V6_vmpybusv_acc(v128, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpybv
+ __builtin_HEXAGON_V6_vmpybv(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpybv.acc
+ __builtin_HEXAGON_V6_vmpybv_acc(v128, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyewuh
+ __builtin_HEXAGON_V6_vmpyewuh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyewuh.64
+ __builtin_HEXAGON_V6_vmpyewuh_64(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyh
+ __builtin_HEXAGON_V6_vmpyh(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyh.acc
+ __builtin_HEXAGON_V6_vmpyh_acc(v128, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyhsat.acc
+ __builtin_HEXAGON_V6_vmpyhsat_acc(v128, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyhsrs
+ __builtin_HEXAGON_V6_vmpyhsrs(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyhss
+ __builtin_HEXAGON_V6_vmpyhss(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyhus
+ __builtin_HEXAGON_V6_vmpyhus(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyhus.acc
+ __builtin_HEXAGON_V6_vmpyhus_acc(v128, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyhv
+ __builtin_HEXAGON_V6_vmpyhv(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyhv.acc
+ __builtin_HEXAGON_V6_vmpyhv_acc(v128, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyhvsrs
+ __builtin_HEXAGON_V6_vmpyhvsrs(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyieoh
+ __builtin_HEXAGON_V6_vmpyieoh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyiewh.acc
+ __builtin_HEXAGON_V6_vmpyiewh_acc(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyiewuh
+ __builtin_HEXAGON_V6_vmpyiewuh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyiewuh.acc
+ __builtin_HEXAGON_V6_vmpyiewuh_acc(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyih
+ __builtin_HEXAGON_V6_vmpyih(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyih.acc
+ __builtin_HEXAGON_V6_vmpyih_acc(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyihb
+ __builtin_HEXAGON_V6_vmpyihb(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyihb.acc
+ __builtin_HEXAGON_V6_vmpyihb_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyiowh
+ __builtin_HEXAGON_V6_vmpyiowh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyiwb
+ __builtin_HEXAGON_V6_vmpyiwb(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyiwb.acc
+ __builtin_HEXAGON_V6_vmpyiwb_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyiwh
+ __builtin_HEXAGON_V6_vmpyiwh(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyiwh.acc
+ __builtin_HEXAGON_V6_vmpyiwh_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyiwub
+ __builtin_HEXAGON_V6_vmpyiwub(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyiwub.acc
+ __builtin_HEXAGON_V6_vmpyiwub_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyowh
+ __builtin_HEXAGON_V6_vmpyowh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyowh.64.acc
+ __builtin_HEXAGON_V6_vmpyowh_64_acc(v128, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyowh.rnd
+ __builtin_HEXAGON_V6_vmpyowh_rnd(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyowh.rnd.sacc
+ __builtin_HEXAGON_V6_vmpyowh_rnd_sacc(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyowh.sacc
+ __builtin_HEXAGON_V6_vmpyowh_sacc(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyub
+ __builtin_HEXAGON_V6_vmpyub(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyub.acc
+ __builtin_HEXAGON_V6_vmpyub_acc(v128, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyubv
+ __builtin_HEXAGON_V6_vmpyubv(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyubv.acc
+ __builtin_HEXAGON_V6_vmpyubv_acc(v128, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyuh
+ __builtin_HEXAGON_V6_vmpyuh(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyuh.acc
+ __builtin_HEXAGON_V6_vmpyuh_acc(v128, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyuhe
+ __builtin_HEXAGON_V6_vmpyuhe(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyuhe.acc
+ __builtin_HEXAGON_V6_vmpyuhe_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vmpyuhv
+ __builtin_HEXAGON_V6_vmpyuhv(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmpyuhv.acc
+ __builtin_HEXAGON_V6_vmpyuhv_acc(v128, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vmux
+ __builtin_HEXAGON_V6_vmux(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vnavgb
+ __builtin_HEXAGON_V6_vnavgb(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vnavgh
+ __builtin_HEXAGON_V6_vnavgh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vnavgub
+ __builtin_HEXAGON_V6_vnavgub(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vnavgw
+ __builtin_HEXAGON_V6_vnavgw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vnormamth
+ __builtin_HEXAGON_V6_vnormamth(v64);
+ // CHECK: @llvm.hexagon.V6.vnormamtw
+ __builtin_HEXAGON_V6_vnormamtw(v64);
+ // CHECK: @llvm.hexagon.V6.vnot
+ __builtin_HEXAGON_V6_vnot(v64);
+ // CHECK: @llvm.hexagon.V6.vor
+ __builtin_HEXAGON_V6_vor(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vpackeb
+ __builtin_HEXAGON_V6_vpackeb(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vpackeh
+ __builtin_HEXAGON_V6_vpackeh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vpackhb.sat
+ __builtin_HEXAGON_V6_vpackhb_sat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vpackhub.sat
+ __builtin_HEXAGON_V6_vpackhub_sat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vpackob
+ __builtin_HEXAGON_V6_vpackob(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vpackoh
+ __builtin_HEXAGON_V6_vpackoh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vpackwh.sat
+ __builtin_HEXAGON_V6_vpackwh_sat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vpackwuh.sat
+ __builtin_HEXAGON_V6_vpackwuh_sat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vpopcounth
+ __builtin_HEXAGON_V6_vpopcounth(v64);
+ // CHECK: @llvm.hexagon.V6.vprefixqb
+ __builtin_HEXAGON_V6_vprefixqb(v64);
+ // CHECK: @llvm.hexagon.V6.vprefixqh
+ __builtin_HEXAGON_V6_vprefixqh(v64);
+ // CHECK: @llvm.hexagon.V6.vprefixqw
+ __builtin_HEXAGON_V6_vprefixqw(v64);
+ // CHECK: @llvm.hexagon.V6.vrdelta
+ __builtin_HEXAGON_V6_vrdelta(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vrmpybub.rtt
+ __builtin_HEXAGON_V6_vrmpybub_rtt(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpybub.rtt.acc
+ __builtin_HEXAGON_V6_vrmpybub_rtt_acc(v128, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpybus
+ __builtin_HEXAGON_V6_vrmpybus(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpybus.acc
+ __builtin_HEXAGON_V6_vrmpybus_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpybusi
+ __builtin_HEXAGON_V6_vrmpybusi(v128, 0, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpybusi.acc
+ __builtin_HEXAGON_V6_vrmpybusi_acc(v128, v128, 0, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpybusv
+ __builtin_HEXAGON_V6_vrmpybusv(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vrmpybusv.acc
+ __builtin_HEXAGON_V6_vrmpybusv_acc(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vrmpybv
+ __builtin_HEXAGON_V6_vrmpybv(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vrmpybv.acc
+ __builtin_HEXAGON_V6_vrmpybv_acc(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vrmpyub
+ __builtin_HEXAGON_V6_vrmpyub(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpyub.acc
+ __builtin_HEXAGON_V6_vrmpyub_acc(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpyub.rtt
+ __builtin_HEXAGON_V6_vrmpyub_rtt(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpyub.rtt.acc
+ __builtin_HEXAGON_V6_vrmpyub_rtt_acc(v128, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpyubi
+ __builtin_HEXAGON_V6_vrmpyubi(v128, 0, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpyubi.acc
+ __builtin_HEXAGON_V6_vrmpyubi_acc(v128, v128, 0, 0);
+ // CHECK: @llvm.hexagon.V6.vrmpyubv
+ __builtin_HEXAGON_V6_vrmpyubv(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vrmpyubv.acc
+ __builtin_HEXAGON_V6_vrmpyubv_acc(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vror
+ __builtin_HEXAGON_V6_vror(v64, 0);
+ // CHECK: @llvm.hexagon.V6.vroundhb
+ __builtin_HEXAGON_V6_vroundhb(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vroundhub
+ __builtin_HEXAGON_V6_vroundhub(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vrounduhub
+ __builtin_HEXAGON_V6_vrounduhub(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vrounduwuh
+ __builtin_HEXAGON_V6_vrounduwuh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vroundwh
+ __builtin_HEXAGON_V6_vroundwh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vroundwuh
+ __builtin_HEXAGON_V6_vroundwuh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vrsadubi
+ __builtin_HEXAGON_V6_vrsadubi(v128, 0, 0);
+ // CHECK: @llvm.hexagon.V6.vrsadubi.acc
+ __builtin_HEXAGON_V6_vrsadubi_acc(v128, v128, 0, 0);
+ // CHECK: @llvm.hexagon.V6.vsathub
+ __builtin_HEXAGON_V6_vsathub(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsatuwuh
+ __builtin_HEXAGON_V6_vsatuwuh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsatwh
+ __builtin_HEXAGON_V6_vsatwh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsb
+ __builtin_HEXAGON_V6_vsb(v64);
+ // CHECK: @llvm.hexagon.V6.vscattermh
+ __builtin_HEXAGON_V6_vscattermh(0, 0, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vscattermh.add
+ __builtin_HEXAGON_V6_vscattermh_add(0, 0, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vscattermhq
+ __builtin_HEXAGON_V6_vscattermhq(v64, 0, 0, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vscattermhw
+ __builtin_HEXAGON_V6_vscattermhw(0, 0, v128, v64);
+ // CHECK: @llvm.hexagon.V6.vscattermhw.add
+ __builtin_HEXAGON_V6_vscattermhw_add(0, 0, v128, v64);
+ // CHECK: @llvm.hexagon.V6.vscattermhwq
+ __builtin_HEXAGON_V6_vscattermhwq(v64, 0, 0, v128, v64);
+ // CHECK: @llvm.hexagon.V6.vscattermw
+ __builtin_HEXAGON_V6_vscattermw(0, 0, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vscattermw.add
+ __builtin_HEXAGON_V6_vscattermw_add(0, 0, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vscattermwq
+ __builtin_HEXAGON_V6_vscattermwq(v64, 0, 0, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsh
+ __builtin_HEXAGON_V6_vsh(v64);
+ // CHECK: @llvm.hexagon.V6.vshufeh
+ __builtin_HEXAGON_V6_vshufeh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vshuffb
+ __builtin_HEXAGON_V6_vshuffb(v64);
+ // CHECK: @llvm.hexagon.V6.vshuffeb
+ __builtin_HEXAGON_V6_vshuffeb(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vshuffh
+ __builtin_HEXAGON_V6_vshuffh(v64);
+ // CHECK: @llvm.hexagon.V6.vshuffob
+ __builtin_HEXAGON_V6_vshuffob(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vshuffvdd
+ __builtin_HEXAGON_V6_vshuffvdd(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vshufoeb
+ __builtin_HEXAGON_V6_vshufoeb(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vshufoeh
+ __builtin_HEXAGON_V6_vshufoeh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vshufoh
+ __builtin_HEXAGON_V6_vshufoh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubb
+ __builtin_HEXAGON_V6_vsubb(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubb.dv
+ __builtin_HEXAGON_V6_vsubb_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubbnq
+ __builtin_HEXAGON_V6_vsubbnq(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubbq
+ __builtin_HEXAGON_V6_vsubbq(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubbsat
+ __builtin_HEXAGON_V6_vsubbsat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubbsat.dv
+ __builtin_HEXAGON_V6_vsubbsat_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubcarry
+ __builtin_HEXAGON_V6_vsubcarry(v64, v64, 0);
+ // CHECK: @llvm.hexagon.V6.vsubh
+ __builtin_HEXAGON_V6_vsubh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubh.dv
+ __builtin_HEXAGON_V6_vsubh_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubhnq
+ __builtin_HEXAGON_V6_vsubhnq(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubhq
+ __builtin_HEXAGON_V6_vsubhq(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubhsat
+ __builtin_HEXAGON_V6_vsubhsat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubhsat.dv
+ __builtin_HEXAGON_V6_vsubhsat_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubhw
+ __builtin_HEXAGON_V6_vsubhw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsububh
+ __builtin_HEXAGON_V6_vsububh(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsububsat
+ __builtin_HEXAGON_V6_vsububsat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsububsat.dv
+ __builtin_HEXAGON_V6_vsububsat_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubububb.sat
+ __builtin_HEXAGON_V6_vsubububb_sat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubuhsat
+ __builtin_HEXAGON_V6_vsubuhsat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubuhsat.dv
+ __builtin_HEXAGON_V6_vsubuhsat_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubuhw
+ __builtin_HEXAGON_V6_vsubuhw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubuwsat
+ __builtin_HEXAGON_V6_vsubuwsat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubuwsat.dv
+ __builtin_HEXAGON_V6_vsubuwsat_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubw
+ __builtin_HEXAGON_V6_vsubw(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubw.dv
+ __builtin_HEXAGON_V6_vsubw_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vsubwnq
+ __builtin_HEXAGON_V6_vsubwnq(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubwq
+ __builtin_HEXAGON_V6_vsubwq(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubwsat
+ __builtin_HEXAGON_V6_vsubwsat(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vsubwsat.dv
+ __builtin_HEXAGON_V6_vsubwsat_dv(v128, v128);
+ // CHECK: @llvm.hexagon.V6.vswap
+ __builtin_HEXAGON_V6_vswap(v64, v64, v64);
+ // CHECK: @llvm.hexagon.V6.vtmpyb
+ __builtin_HEXAGON_V6_vtmpyb(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vtmpyb.acc
+ __builtin_HEXAGON_V6_vtmpyb_acc(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vtmpybus
+ __builtin_HEXAGON_V6_vtmpybus(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vtmpybus.acc
+ __builtin_HEXAGON_V6_vtmpybus_acc(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vtmpyhb
+ __builtin_HEXAGON_V6_vtmpyhb(v128, 0);
+ // CHECK: @llvm.hexagon.V6.vtmpyhb.acc
+ __builtin_HEXAGON_V6_vtmpyhb_acc(v128, v128, 0);
+ // CHECK: @llvm.hexagon.V6.vunpackb
+ __builtin_HEXAGON_V6_vunpackb(v64);
+ // CHECK: @llvm.hexagon.V6.vunpackh
+ __builtin_HEXAGON_V6_vunpackh(v64);
+ // CHECK: @llvm.hexagon.V6.vunpackob
+ __builtin_HEXAGON_V6_vunpackob(v128, v64);
+ // CHECK: @llvm.hexagon.V6.vunpackoh
+ __builtin_HEXAGON_V6_vunpackoh(v128, v64);
+ // CHECK: @llvm.hexagon.V6.vunpackub
+ __builtin_HEXAGON_V6_vunpackub(v64);
+ // CHECK: @llvm.hexagon.V6.vunpackuh
+ __builtin_HEXAGON_V6_vunpackuh(v64);
+ // CHECK: @llvm.hexagon.V6.vxor
+ __builtin_HEXAGON_V6_vxor(v64, v64);
+ // CHECK: @llvm.hexagon.V6.vzb
+ __builtin_HEXAGON_V6_vzb(v64);
+ // CHECK: @llvm.hexagon.V6.vzh
+ __builtin_HEXAGON_V6_vzh(v64);
+}
diff --git a/test/CodeGen/builtins-mips-args.c b/test/CodeGen/builtins-mips-args.c
index fd3e31443ecf..cdb42af4a53d 100644
--- a/test/CodeGen/builtins-mips-args.c
+++ b/test/CodeGen/builtins-mips-args.c
@@ -7,10 +7,10 @@ void foo() {
int a = 3;
__builtin_mips_wrdsp(2052, a); // expected-error{{argument to '__builtin_mips_wrdsp' must be a constant integer}}
__builtin_mips_rddsp(a); // expected-error{{argument to '__builtin_mips_rddsp' must be a constant integer}}
- __builtin_mips_wrdsp(2052, -1); // expected-error{{argument should be a value from 0 to 63}}
- __builtin_mips_rddsp(-1); // expected-error{{argument should be a value from 0 to 63}}
- __builtin_mips_wrdsp(2052, 64); // expected-error{{argument should be a value from 0 to 63}}
- __builtin_mips_rddsp(64); // expected-error{{argument should be a value from 0 to 63}}
+ __builtin_mips_wrdsp(2052, -1); // expected-error-re{{argument value {{.*}} is outside the valid range}}
+ __builtin_mips_rddsp(-1); // expected-error-re{{argument value {{.*}} is outside the valid range}}
+ __builtin_mips_wrdsp(2052, 64); // expected-error-re{{argument value {{.*}} is outside the valid range}}
+ __builtin_mips_rddsp(64); // expected-error-re{{argument value {{.*}} is outside the valid range}}
// MIPS DSP Rev 2
@@ -20,18 +20,18 @@ void foo() {
__builtin_mips_precr_sra_r_ph_w(1, 2, a); // expected-error{{argument to '__builtin_mips_precr_sra_r_ph_w' must be a constant integer}}
__builtin_mips_prepend(1, 2, a); // expected-error{{argument to '__builtin_mips_prepend' must be a constant integer}}
- __builtin_mips_append(1, 2, -1); // expected-error{{argument should be a value from 0 to 31}}
- __builtin_mips_append(1, 2, 32); // expected-error{{argument should be a value from 0 to 31}}
+ __builtin_mips_append(1, 2, -1); // expected-error-re{{argument value {{.*}} is outside the valid range}}
+ __builtin_mips_append(1, 2, 32); // expected-error-re{{argument value {{.*}} is outside the valid range}}
- __builtin_mips_balign(1, 2, -1); // expected-error{{argument should be a value from 0 to 3}}
- __builtin_mips_balign(1, 2, 4); // expected-error{{argument should be a value from 0 to 3}}
+ __builtin_mips_balign(1, 2, -1); // expected-error-re{{argument value {{.*}} is outside the valid range}}
+ __builtin_mips_balign(1, 2, 4); // expected-error-re{{argument value {{.*}} is outside the valid range}}
- __builtin_mips_precr_sra_ph_w(1, 2, -1); // expected-error{{argument should be a value from 0 to 31}}
- __builtin_mips_precr_sra_ph_w(1, 2, 32); // expected-error{{argument should be a value from 0 to 31}}
+ __builtin_mips_precr_sra_ph_w(1, 2, -1); // expected-error-re{{argument value {{.*}} is outside the valid range}}
+ __builtin_mips_precr_sra_ph_w(1, 2, 32); // expected-error-re{{argument value {{.*}} is outside the valid range}}
- __builtin_mips_precr_sra_r_ph_w(1, 2, -1); // expected-error{{argument should be a value from 0 to 31}}
- __builtin_mips_precr_sra_r_ph_w(1, 2, 32); // expected-error{{argument should be a value from 0 to 31}}
+ __builtin_mips_precr_sra_r_ph_w(1, 2, -1); // expected-error-re{{argument value {{.*}} is outside the valid range}}
+ __builtin_mips_precr_sra_r_ph_w(1, 2, 32); // expected-error-re{{argument value {{.*}} is outside the valid range}}
- __builtin_mips_prepend(1, 2, -1); // expected-error{{argument should be a value from 0 to 31}}
- __builtin_mips_prepend(1, 2, -1); // expected-error{{argument should be a value from 0 to 31}}
+ __builtin_mips_prepend(1, 2, -1); // expected-error-re{{argument value {{.*}} is outside the valid range}}
+ __builtin_mips_prepend(1, 2, -1); // expected-error-re{{argument value {{.*}} is outside the valid range}}
}
diff --git a/test/CodeGen/builtins-ms.c b/test/CodeGen/builtins-ms.c
index a33b57de8ddf..915cc17e002c 100644
--- a/test/CodeGen/builtins-ms.c
+++ b/test/CodeGen/builtins-ms.c
@@ -1,6 +1,6 @@
// RUN: %clang_cc1 %s -emit-llvm -o - -fms-extensions -triple i686-pc-win32 | FileCheck %s
-// CHECK-LABEL: define void @test_alloca(
+// CHECK-LABEL: define dso_local void @test_alloca(
void capture(void *);
void test_alloca(int n) {
capture(_alloca(n));
@@ -8,7 +8,7 @@ void test_alloca(int n) {
// CHECK: call void @capture(i8* %[[arg]])
}
-// CHECK-LABEL: define void @test_alloca_with_align(
+// CHECK-LABEL: define dso_local void @test_alloca_with_align(
void test_alloca_with_align(int n) {
capture(__builtin_alloca_with_align(n, 64));
// CHECK: %[[arg:.*]] = alloca i8, i32 %{{.*}}, align 8
diff --git a/test/CodeGen/builtins-nvptx-ptx50.cu b/test/CodeGen/builtins-nvptx-ptx50.cu
index e85be442eb47..72e1aecb4870 100644
--- a/test/CodeGen/builtins-nvptx-ptx50.cu
+++ b/test/CodeGen/builtins-nvptx-ptx50.cu
@@ -18,6 +18,6 @@
// CHECK-LABEL: test_fn
__device__ void test_fn(double d, double* double_ptr) {
// CHECK: call double @llvm.nvvm.atomic.load.add.f64.p0f64
- // expected-error@+1 {{'__nvvm_atom_add_gen_d' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_add_gen_d' needs target feature sm_60}}
__nvvm_atom_add_gen_d(double_ptr, d);
}
diff --git a/test/CodeGen/builtins-nvptx-sm_70.cu b/test/CodeGen/builtins-nvptx-sm_70.cu
index 09e5b6ba7a7d..66fa1b5d6386 100644
--- a/test/CodeGen/builtins-nvptx-sm_70.cu
+++ b/test/CodeGen/builtins-nvptx-sm_70.cu
@@ -1,9 +1,16 @@
// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_70 \
// RUN: -fcuda-is-device -target-feature +ptx60 \
// RUN: -S -emit-llvm -o - -x cuda %s \
-// RUN: | FileCheck -check-prefix=CHECK %s
+// RUN: | FileCheck -check-prefix=CHECK_M16 %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_70 \
+// RUN: -fcuda-is-device -target-feature +ptx61 -DPTX61 \
+// RUN: -S -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefixes=CHECK_M16,CHECK_M32_M8 %s
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_60 \
-// RUN: -fcuda-is-device -S -o /dev/null -x cuda -verify %s
+// RUN: -DPTX61 -fcuda-is-device -S -o /dev/null -x cuda -verify=pre-sm_70 %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown \
+// RUN: -target-cpu sm_70 -target-feature +ptx60 \
+// RUN: -DPTX61 -fcuda-is-device -S -o /dev/null -x cuda -verify=pre-ptx61 %s
#if !defined(CUDA_VERSION)
#define __device__ __attribute__((device))
@@ -18,149 +25,443 @@ typedef unsigned long long uint64_t;
// that encounters an error, so -verify will not be able to find errors in
// subsequent functions.
-// CHECK-LABEL: nvvm_wmma
-__device__ void nvvm_wmma(int *src, int *dst,
- float *fsrc, float *fdst,
- int ldm) {
- // CHECK: call {{.*}} @llvm.nvvm.wmma.load.a.sync.row.m16n16k16.stride.f16
- // expected-error@+1 {{'__hmma_m16n16k16_ld_a' needs target feature ptx60}}
+// CHECK-LABEL: nvvm_wmma_m16n16k16
+__device__ void nvvm_wmma_m16n16k16(int *src, int *dst,
+ float *fsrc, float *fdst,
+ int ldm) {
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_ld_a' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_ld_a(dst, src, ldm, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.load.a.sync.col.m16n16k16.stride.f16
- // expected-error@+1 {{'__hmma_m16n16k16_ld_a' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_ld_a' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_ld_a(dst, src+1, ldm, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.load.b.sync.row.m16n16k16.stride.f16
- // expected-error@+1 {{'__hmma_m16n16k16_ld_b' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_ld_b' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_ld_b(dst, src, ldm, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.load.b.sync.col.m16n16k16.stride.f16
- // expected-error@+1 {{'__hmma_m16n16k16_ld_b' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_ld_b' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_ld_b(dst, src+2, ldm, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.load.c.sync.row.m16n16k16.stride.f16
- // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f16
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_ld_c_f16(dst, src, ldm, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.load.c.sync.col.m16n16k16.stride.f16
- // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f16
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_ld_c_f16(dst, src, ldm, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.load.c.sync.row.m16n16k16.stride.f32
- // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.load.c.sync.col.m16n16k16.stride.f32
- // expected-error@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.store.d.sync.row.m16n16k16.stride.f16
- // expected-error@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_st_c_f16(dst, src, ldm, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.store.d.sync.col.m16n16k16.stride.f16
- // expected-error@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f16
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_st_c_f16(dst, src, ldm, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.store.d.sync.row.m16n16k16.stride.f32
- // expected-error@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.store.d.sync.col.m16n16k16.stride.f32
- // expected-error@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f16.f16
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f16.f16.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16.satfinite
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f16.f16
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f16.f16.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16.satfinite
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f16.f16
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f16.f16.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16.satfinite
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f16.f16
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f16.f16.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16.satfinite
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f16.f32
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f16.f32.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32.satfinite
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f16.f32
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f16.f32.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32.satfinite
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f16.f32
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f16.f32.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32.satfinite
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f16.f32
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f16.f32.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32.satfinite
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f32.f16
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f32.f16.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16.satfinite
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f32.f16
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f32.f16.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16.satfinite
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f32.f16
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f32.f16.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16.satfinite
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f32.f16
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f32.f16.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16.satfinite
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f32.f32
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.row.m16n16k16.f32.f32.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32.satfinite
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f32.f32
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.row.col.m16n16k16.f32.f32.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32.satfinite
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f32.f32
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.row.m16n16k16.f32.f32.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32.satfinite
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 1);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f32.f32
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 0);
- // CHECK: call {{.*}} @llvm.nvvm.wmma.mma.sync.col.col.m16n16k16.f32.f32.satfinite
- // expected-error@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature ptx60}}
+ // CHECK_M16: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32.satfinite
+ // pre-sm_70-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
__hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 1);
}
+
+#ifdef PTX61
+// CHECK-LABEL: nvvm_wmma_m32n8k16
+__device__ void nvvm_wmma_m32n8k16(int *src, int *dst,
+ float *fsrc, float *fdst,
+ int ldm) {
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_ld_a(dst, src, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_ld_a(dst, src+1, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_ld_b(dst, src, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_ld_b(dst, src+2, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_ld_c_f16(dst, src, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_ld_c_f16(dst, src, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_ld_c_f32(fdst, fsrc, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_ld_c_f32(fdst, fsrc, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_st_c_f16(dst, src, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_st_c_f16(dst, src, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_st_c_f32(fdst, fsrc, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_st_c_f32(fdst, fsrc, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 0, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 0, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 1, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f16.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 1, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 2, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f16.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 2, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 3, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 3, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 0, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f32.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 0, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 1, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f32.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 1, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 2, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f32.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 2, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 3, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f32.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 3, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 0, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f16.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 0, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 1, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f16.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 1, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 2, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f16.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 2, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 3, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f16.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 3, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 0, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 0, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 1, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f32.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 1, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 2, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f32.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 2, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 3, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 3, 1);
+
+
+ // m8n32k16 variants.
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_ld_a(dst, src, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_ld_a(dst, src+1, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_ld_b(dst, src, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_ld_b(dst, src+2, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_ld_c_f16(dst, src, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_ld_c_f16(dst, src, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_ld_c_f32(fdst, fsrc, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_ld_c_f32(fdst, fsrc, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_st_c_f16(dst, src, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_st_c_f16(dst, src, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_st_c_f32(fdst, fsrc, ldm, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_st_c_f32(fdst, fsrc, ldm, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 0, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 0, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 1, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f16.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 1, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 2, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f16.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 2, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 3, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 3, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 0, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f32.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 0, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 1, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f32.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 1, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 2, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f32.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 2, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 3, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f32.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 3, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 0, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f16.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 0, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 1, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f16.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 1, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 2, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f16.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 2, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f16
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 3, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f16.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 3, 1);
+
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 0, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 0, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 1, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f32.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 1, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 2, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f32.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 2, 1);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 3, 0);
+ // CHECK_M32_M8: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32.satfinite
+ // pre-ptx61-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 3, 1);
+}
+#endif
diff --git a/test/CodeGen/builtins-nvptx.c b/test/CodeGen/builtins-nvptx.c
index 89a982377ad4..16f41bac3432 100644
--- a/test/CodeGen/builtins-nvptx.c
+++ b/test/CodeGen/builtins-nvptx.c
@@ -5,6 +5,9 @@
// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_60 \
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_61 \
+// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 \
// RUN: -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s
@@ -292,245 +295,245 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip,
#if ERROR_CHECK || __CUDA_ARCH__ >= 600
// CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_cta_add_gen_i' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_add_gen_i' needs target feature sm_60}}
__nvvm_atom_cta_add_gen_i(ip, i);
// LP32: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_add_gen_l' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_add_gen_l' needs target feature sm_60}}
__nvvm_atom_cta_add_gen_l(&dl, l);
// CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_add_gen_ll' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_add_gen_ll' needs target feature sm_60}}
__nvvm_atom_cta_add_gen_ll(&sll, ll);
// CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_sys_add_gen_i' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_add_gen_i' needs target feature sm_60}}
__nvvm_atom_sys_add_gen_i(ip, i);
// LP32: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_add_gen_l' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_add_gen_l' needs target feature sm_60}}
__nvvm_atom_sys_add_gen_l(&dl, l);
// CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_add_gen_ll' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_add_gen_ll' needs target feature sm_60}}
__nvvm_atom_sys_add_gen_ll(&sll, ll);
// CHECK: call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0f32
- // expected-error@+1 {{'__nvvm_atom_cta_add_gen_f' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_add_gen_f' needs target feature sm_60}}
__nvvm_atom_cta_add_gen_f(fp, f);
// CHECK: call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0f64
- // expected-error@+1 {{'__nvvm_atom_cta_add_gen_d' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_add_gen_d' needs target feature sm_60}}
__nvvm_atom_cta_add_gen_d(dfp, df);
// CHECK: call float @llvm.nvvm.atomic.add.gen.f.sys.f32.p0f32
- // expected-error@+1 {{'__nvvm_atom_sys_add_gen_f' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_add_gen_f' needs target feature sm_60}}
__nvvm_atom_sys_add_gen_f(fp, f);
// CHECK: call double @llvm.nvvm.atomic.add.gen.f.sys.f64.p0f64
- // expected-error@+1 {{'__nvvm_atom_sys_add_gen_d' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_add_gen_d' needs target feature sm_60}}
__nvvm_atom_sys_add_gen_d(dfp, df);
// CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_i' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_i' needs target feature sm_60}}
__nvvm_atom_cta_xchg_gen_i(ip, i);
// LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_l' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_l' needs target feature sm_60}}
__nvvm_atom_cta_xchg_gen_l(&dl, l);
// CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_ll' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_ll' needs target feature sm_60}}
__nvvm_atom_cta_xchg_gen_ll(&sll, ll);
// CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_i' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_i' needs target feature sm_60}}
__nvvm_atom_sys_xchg_gen_i(ip, i);
// LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_l' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_l' needs target feature sm_60}}
__nvvm_atom_sys_xchg_gen_l(&dl, l);
// CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_ll' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_ll' needs target feature sm_60}}
__nvvm_atom_sys_xchg_gen_ll(&sll, ll);
// CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_cta_max_gen_i' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_max_gen_i' needs target feature sm_60}}
__nvvm_atom_cta_max_gen_i(ip, i);
// CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ui' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ui' needs target feature sm_60}}
__nvvm_atom_cta_max_gen_ui((unsigned int *)ip, i);
// LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_max_gen_l' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_max_gen_l' needs target feature sm_60}}
__nvvm_atom_cta_max_gen_l(&dl, l);
// LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ul' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ul' needs target feature sm_60}}
__nvvm_atom_cta_max_gen_ul((unsigned long *)lp, l);
// CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ll' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ll' needs target feature sm_60}}
__nvvm_atom_cta_max_gen_ll(&sll, ll);
// CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ull' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ull' needs target feature sm_60}}
__nvvm_atom_cta_max_gen_ull((unsigned long long *)llp, ll);
// CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_sys_max_gen_i' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_max_gen_i' needs target feature sm_60}}
__nvvm_atom_sys_max_gen_i(ip, i);
// CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ui' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ui' needs target feature sm_60}}
__nvvm_atom_sys_max_gen_ui((unsigned int *)ip, i);
// LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_max_gen_l' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_max_gen_l' needs target feature sm_60}}
__nvvm_atom_sys_max_gen_l(&dl, l);
// LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ul' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ul' needs target feature sm_60}}
__nvvm_atom_sys_max_gen_ul((unsigned long *)lp, l);
// CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ll' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ll' needs target feature sm_60}}
__nvvm_atom_sys_max_gen_ll(&sll, ll);
// CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ull' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ull' needs target feature sm_60}}
__nvvm_atom_sys_max_gen_ull((unsigned long long *)llp, ll);
// CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_cta_min_gen_i' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_min_gen_i' needs target feature sm_60}}
__nvvm_atom_cta_min_gen_i(ip, i);
// CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ui' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ui' needs target feature sm_60}}
__nvvm_atom_cta_min_gen_ui((unsigned int *)ip, i);
// LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_min_gen_l' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_min_gen_l' needs target feature sm_60}}
__nvvm_atom_cta_min_gen_l(&dl, l);
// LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ul' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ul' needs target feature sm_60}}
__nvvm_atom_cta_min_gen_ul((unsigned long *)lp, l);
// CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ll' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ll' needs target feature sm_60}}
__nvvm_atom_cta_min_gen_ll(&sll, ll);
// CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ull' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ull' needs target feature sm_60}}
__nvvm_atom_cta_min_gen_ull((unsigned long long *)llp, ll);
// CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_sys_min_gen_i' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_min_gen_i' needs target feature sm_60}}
__nvvm_atom_sys_min_gen_i(ip, i);
// CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ui' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ui' needs target feature sm_60}}
__nvvm_atom_sys_min_gen_ui((unsigned int *)ip, i);
// LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_min_gen_l' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_min_gen_l' needs target feature sm_60}}
__nvvm_atom_sys_min_gen_l(&dl, l);
// LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ul' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ul' needs target feature sm_60}}
__nvvm_atom_sys_min_gen_ul((unsigned long *)lp, l);
// CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ll' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ll' needs target feature sm_60}}
__nvvm_atom_sys_min_gen_ll(&sll, ll);
// CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ull' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ull' needs target feature sm_60}}
__nvvm_atom_sys_min_gen_ull((unsigned long long *)llp, ll);
// CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.cta.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_cta_inc_gen_ui' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_inc_gen_ui' needs target feature sm_60}}
__nvvm_atom_cta_inc_gen_ui((unsigned int *)ip, i);
// CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.sys.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_sys_inc_gen_ui' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_inc_gen_ui' needs target feature sm_60}}
__nvvm_atom_sys_inc_gen_ui((unsigned int *)ip, i);
// CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.cta.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_cta_dec_gen_ui' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_dec_gen_ui' needs target feature sm_60}}
__nvvm_atom_cta_dec_gen_ui((unsigned int *)ip, i);
// CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.sys.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_sys_dec_gen_ui' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_dec_gen_ui' needs target feature sm_60}}
__nvvm_atom_sys_dec_gen_ui((unsigned int *)ip, i);
// CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_cta_and_gen_i' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_and_gen_i' needs target feature sm_60}}
__nvvm_atom_cta_and_gen_i(ip, i);
// LP32: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_and_gen_l' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_and_gen_l' needs target feature sm_60}}
__nvvm_atom_cta_and_gen_l(&dl, l);
// CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_and_gen_ll' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_and_gen_ll' needs target feature sm_60}}
__nvvm_atom_cta_and_gen_ll(&sll, ll);
// CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_sys_and_gen_i' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_and_gen_i' needs target feature sm_60}}
__nvvm_atom_sys_and_gen_i(ip, i);
// LP32: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_and_gen_l' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_and_gen_l' needs target feature sm_60}}
__nvvm_atom_sys_and_gen_l(&dl, l);
// CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_and_gen_ll' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_and_gen_ll' needs target feature sm_60}}
__nvvm_atom_sys_and_gen_ll(&sll, ll);
// CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_cta_or_gen_i' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_or_gen_i' needs target feature sm_60}}
__nvvm_atom_cta_or_gen_i(ip, i);
// LP32: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_or_gen_l' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_or_gen_l' needs target feature sm_60}}
__nvvm_atom_cta_or_gen_l(&dl, l);
// CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_or_gen_ll' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_or_gen_ll' needs target feature sm_60}}
__nvvm_atom_cta_or_gen_ll(&sll, ll);
// CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_sys_or_gen_i' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_or_gen_i' needs target feature sm_60}}
__nvvm_atom_sys_or_gen_i(ip, i);
// LP32: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_or_gen_l' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_or_gen_l' needs target feature sm_60}}
__nvvm_atom_sys_or_gen_l(&dl, l);
// CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_or_gen_ll' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_or_gen_ll' needs target feature sm_60}}
__nvvm_atom_sys_or_gen_ll(&sll, ll);
// CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_i' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_i' needs target feature sm_60}}
__nvvm_atom_cta_xor_gen_i(ip, i);
// LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_l' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_l' needs target feature sm_60}}
__nvvm_atom_cta_xor_gen_l(&dl, l);
// CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_ll' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_ll' needs target feature sm_60}}
__nvvm_atom_cta_xor_gen_ll(&sll, ll);
// CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_i' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_i' needs target feature sm_60}}
__nvvm_atom_sys_xor_gen_i(ip, i);
// LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_l' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_l' needs target feature sm_60}}
__nvvm_atom_sys_xor_gen_l(&dl, l);
// CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_ll' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_ll' needs target feature sm_60}}
__nvvm_atom_sys_xor_gen_ll(&sll, ll);
// CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_i' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_i' needs target feature sm_60}}
__nvvm_atom_cta_cas_gen_i(ip, i, 0);
// LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_l' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_l' needs target feature sm_60}}
__nvvm_atom_cta_cas_gen_l(&dl, l, 0);
// CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_ll' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_ll' needs target feature sm_60}}
__nvvm_atom_cta_cas_gen_ll(&sll, ll, 0);
// CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0i32
- // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_i' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_i' needs target feature sm_60}}
__nvvm_atom_sys_cas_gen_i(ip, i, 0);
// LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0i32
// LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_l' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_l' needs target feature sm_60}}
__nvvm_atom_sys_cas_gen_l(&dl, l, 0);
// CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0i64
- // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_ll' needs target feature satom}}
+ // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_ll' needs target feature sm_60}}
__nvvm_atom_sys_cas_gen_ll(&sll, ll, 0);
#endif
diff --git a/test/CodeGen/builtins-overflow.c b/test/CodeGen/builtins-overflow.c
index 7a30cfbd46ee..57f90eb66a5f 100644
--- a/test/CodeGen/builtins-overflow.c
+++ b/test/CodeGen/builtins-overflow.c
@@ -14,7 +14,7 @@ extern long long LongLongErrorCode;
void overflowed(void);
unsigned test_add_overflow_uint_uint_uint(unsigned x, unsigned y) {
- // CHECK-LABEL: define i32 @test_add_overflow_uint_uint_uint
+ // CHECK-LABEL: define {{(dso_local )?}}i32 @test_add_overflow_uint_uint_uint
// CHECK-NOT: ext
// CHECK: [[S:%.+]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %{{.+}}, i32 %{{.+}})
// CHECK-DAG: [[Q:%.+]] = extractvalue { i32, i1 } [[S]], 0
@@ -28,7 +28,7 @@ unsigned test_add_overflow_uint_uint_uint(unsigned x, unsigned y) {
}
int test_add_overflow_int_int_int(int x, int y) {
- // CHECK-LABEL: define i32 @test_add_overflow_int_int_int
+ // CHECK-LABEL: define {{(dso_local )?}}i32 @test_add_overflow_int_int_int
// CHECK-NOT: ext
// CHECK: [[S:%.+]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %{{.+}}, i32 %{{.+}})
// CHECK-DAG: [[C:%.+]] = extractvalue { i32, i1 } [[S]], 1
@@ -42,7 +42,7 @@ int test_add_overflow_int_int_int(int x, int y) {
}
unsigned test_sub_overflow_uint_uint_uint(unsigned x, unsigned y) {
- // CHECK-LABEL: define i32 @test_sub_overflow_uint_uint_uint
+ // CHECK-LABEL: define {{(dso_local )?}}i32 @test_sub_overflow_uint_uint_uint
// CHECK-NOT: ext
// CHECK: [[S:%.+]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %{{.+}}, i32 %{{.+}})
// CHECK-DAG: [[Q:%.+]] = extractvalue { i32, i1 } [[S]], 0
@@ -56,7 +56,7 @@ unsigned test_sub_overflow_uint_uint_uint(unsigned x, unsigned y) {
}
int test_sub_overflow_int_int_int(int x, int y) {
- // CHECK-LABEL: define i32 @test_sub_overflow_int_int_int
+ // CHECK-LABEL: define {{(dso_local )?}}i32 @test_sub_overflow_int_int_int
// CHECK-NOT: ext
// CHECK: [[S:%.+]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %{{.+}}, i32 %{{.+}})
// CHECK-DAG: [[C:%.+]] = extractvalue { i32, i1 } [[S]], 1
@@ -70,7 +70,7 @@ int test_sub_overflow_int_int_int(int x, int y) {
}
unsigned test_mul_overflow_uint_uint_uint(unsigned x, unsigned y) {
- // CHECK-LABEL: define i32 @test_mul_overflow_uint_uint_uint
+ // CHECK-LABEL: define {{(dso_local )?}}i32 @test_mul_overflow_uint_uint_uint
// CHECK-NOT: ext
// CHECK: [[S:%.+]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %{{.+}}, i32 %{{.+}})
// CHECK-DAG: [[Q:%.+]] = extractvalue { i32, i1 } [[S]], 0
@@ -84,7 +84,7 @@ unsigned test_mul_overflow_uint_uint_uint(unsigned x, unsigned y) {
}
int test_mul_overflow_int_int_int(int x, int y) {
- // CHECK-LABEL: define i32 @test_mul_overflow_int_int_int
+ // CHECK-LABEL: define {{(dso_local )?}}i32 @test_mul_overflow_int_int_int
// CHECK-NOT: ext
// CHECK: [[S:%.+]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %{{.+}}, i32 %{{.+}})
// CHECK-DAG: [[C:%.+]] = extractvalue { i32, i1 } [[S]], 1
@@ -98,7 +98,7 @@ int test_mul_overflow_int_int_int(int x, int y) {
}
int test_add_overflow_uint_int_int(unsigned x, int y) {
- // CHECK-LABEL: define i32 @test_add_overflow_uint_int_int
+ // CHECK-LABEL: define {{(dso_local )?}}i32 @test_add_overflow_uint_int_int
// CHECK: [[XE:%.+]] = zext i32 %{{.+}} to i33
// CHECK: [[YE:%.+]] = sext i32 %{{.+}} to i33
// CHECK: [[S:%.+]] = call { i33, i1 } @llvm.sadd.with.overflow.i33(i33 [[XE]], i33 [[YE]])
@@ -136,7 +136,7 @@ _Bool test_add_overflow_uint_uint_bool(unsigned x, unsigned y) {
}
unsigned test_add_overflow_bool_bool_uint(_Bool x, _Bool y) {
- // CHECK-LABEL: define i32 @test_add_overflow_bool_bool_uint
+ // CHECK-LABEL: define {{(dso_local )?}}i32 @test_add_overflow_bool_bool_uint
// CHECK: [[XE:%.+]] = zext i1 %{{.+}} to i32
// CHECK: [[YE:%.+]] = zext i1 %{{.+}} to i32
// CHECK: [[S:%.+]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[XE]], i32 [[YE]])
@@ -165,7 +165,7 @@ _Bool test_add_overflow_bool_bool_bool(_Bool x, _Bool y) {
}
int test_add_overflow_volatile(int x, int y) {
- // CHECK-LABEL: define i32 @test_add_overflow_volatile
+ // CHECK-LABEL: define {{(dso_local )?}}i32 @test_add_overflow_volatile
// CHECK: [[S:%.+]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %{{.+}}, i32 %{{.+}})
// CHECK-DAG: [[Q:%.+]] = extractvalue { i32, i1 } [[S]], 0
// CHECK-DAG: [[C:%.+]] = extractvalue { i32, i1 } [[S]], 1
@@ -373,7 +373,9 @@ int test_mixed_sign_mull_overflow_unsigned(int x, unsigned y) {
// CHECK-NEXT: [[NotNull:%.*]] = icmp ne i32 [[UnsignedResult]], 0
// CHECK-NEXT: [[Underflow:%.*]] = and i1 [[IsNeg]], [[NotNull]]
// CHECK-NEXT: [[OFlow:%.*]] = or i1 [[UnsignedOFlow]], [[Underflow]]
-// CHECK-NEXT: store i32 [[UnsignedResult]], i32* %{{.*}}, align 4
+// CHECK-NEXT: [[NegatedResult:%.*]] = sub i32 0, [[UnsignedResult]]
+// CHECK-NEXT: [[Result:%.*]] = select i1 [[IsNeg]], i32 [[NegatedResult]], i32 [[UnsignedResult]]
+// CHECK-NEXT: store i32 [[Result]], i32* %{{.*}}, align 4
// CHECK: br i1 [[OFlow]]
unsigned result;
@@ -432,7 +434,9 @@ long long test_mixed_sign_mulll_overflow_trunc_unsigned(long long x, unsigned lo
// CHECK-NEXT: [[OVERFLOW_PRE_TRUNC:%.*]] = or i1 {{.*}}, [[UNDERFLOW]]
// CHECK-NEXT: [[TRUNC_OVERFLOW:%.*]] = icmp ugt i64 [[UNSIGNED_RESULT]], 4294967295
// CHECK-NEXT: [[OVERFLOW:%.*]] = or i1 [[OVERFLOW_PRE_TRUNC]], [[TRUNC_OVERFLOW]]
-// CHECK-NEXT: trunc i64 [[UNSIGNED_RESULT]] to i32
+// CHECK-NEXT: [[NEGATED:%.*]] = sub i64 0, [[UNSIGNED_RESULT]]
+// CHECK-NEXT: [[RESULT:%.*]] = select i1 {{.*}}, i64 [[NEGATED]], i64 [[UNSIGNED_RESULT]]
+// CHECK-NEXT: trunc i64 [[RESULT]] to i32
// CHECK-NEXT: store
unsigned result;
if (__builtin_mul_overflow(y, x, &result))
diff --git a/test/CodeGen/builtins-ppc-p9-f128.c b/test/CodeGen/builtins-ppc-p9-f128.c
new file mode 100644
index 000000000000..c43915ad2489
--- /dev/null
+++ b/test/CodeGen/builtins-ppc-p9-f128.c
@@ -0,0 +1,50 @@
+// RUN: %clang_cc1 -triple powerpc64le-unknown-linux-gnu -emit-llvm \
+// RUN: -target-cpu pwr9 -target-feature +float128 -o - %s | FileCheck %s
+
+__float128 A;
+__float128 B;
+__float128 C;
+
+
+__float128 testSqrtOdd() {
+ return __builtin_sqrtf128_round_to_odd(A);
+// CHECK: @llvm.ppc.sqrtf128.round.to.odd(fp128
+// CHECK-NEXT: ret fp128
+}
+
+__float128 testFMAOdd() {
+ return __builtin_fmaf128_round_to_odd(A, B, C);
+// CHECK: @llvm.ppc.fmaf128.round.to.odd(fp128 %{{.+}}, fp128 %{{.+}}, fp128
+// CHECK-NEXT: ret fp128
+}
+
+__float128 testAddOdd() {
+ return __builtin_addf128_round_to_odd(A, B);
+// CHECK: @llvm.ppc.addf128.round.to.odd(fp128 %{{.+}}, fp128
+// CHECK-NEXT: ret fp128
+}
+
+__float128 testSubOdd() {
+ return __builtin_subf128_round_to_odd(A, B);
+// CHECK: @llvm.ppc.subf128.round.to.odd(fp128 %{{.+}}, fp128
+// CHECK-NEXT: ret fp128
+}
+
+__float128 testMulOdd() {
+ return __builtin_mulf128_round_to_odd(A, B);
+// CHECK: @llvm.ppc.mulf128.round.to.odd(fp128 %{{.+}}, fp128
+// CHECK-NEXT: ret fp128
+}
+
+__float128 testDivOdd() {
+ return __builtin_divf128_round_to_odd(A, B);
+// CHECK: @llvm.ppc.divf128.round.to.odd(fp128 %{{.+}}, fp128
+// CHECK-NEXT: ret fp128
+}
+
+double testTruncOdd() {
+ return __builtin_truncf128_round_to_odd(A);
+// CHECK: @llvm.ppc.truncf128.round.to.odd(fp128
+// CHECK-NEXT: ret double
+}
+
diff --git a/test/CodeGen/builtins-ppc-p9vector.c b/test/CodeGen/builtins-ppc-p9vector.c
index f92df86561ad..bfbb81585471 100644
--- a/test/CodeGen/builtins-ppc-p9vector.c
+++ b/test/CodeGen/builtins-ppc-p9vector.c
@@ -983,7 +983,7 @@ vector bool int test86(void) {
}
vector bool long long test87(void) {
// CHECK-BE: @llvm.ppc.vsx.xvtstdcdp(<2 x double> {{.+}}, i32 127)
-// CHECK-BE_NEXT: ret <2 x i64
+// CHECK-BE-NEXT: ret <2 x i64>
// CHECK: @llvm.ppc.vsx.xvtstdcdp(<2 x double> {{.+}}, i32 127)
// CHECK-NEXT: ret <2 x i64>
return vec_test_data_class(vda, __VEC_CLASS_FP_NOT_NORMAL);
diff --git a/test/CodeGen/builtins-ppc-vsx.c b/test/CodeGen/builtins-ppc-vsx.c
index 6dda19d5d9f3..848d24d4fb10 100644
--- a/test/CodeGen/builtins-ppc-vsx.c
+++ b/test/CodeGen/builtins-ppc-vsx.c
@@ -1694,43 +1694,43 @@ vec_xst_be(vd, sll, ad);
res_vd = vec_xxpermdi(vd, vd, 0);
// CHECK: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 0, i32 2>
-// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 3, i32 1>
+// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 0, i32 2>
res_vf = vec_xxpermdi(vf, vf, 1);
// CHECK: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 0, i32 3>
-// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 2, i32 1>
+// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 0, i32 3>
res_vsll = vec_xxpermdi(vsll, vsll, 2);
// CHECK: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 1, i32 2>
-// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 3, i32 0>
+// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 1, i32 2>
res_vull = vec_xxpermdi(vull, vull, 3);
// CHECK: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 1, i32 3>
-// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 2, i32 0>
+// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 1, i32 3>
res_vsi = vec_xxpermdi(vsi, vsi, 0);
// CHECK: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 0, i32 2>
-// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 3, i32 1>
+// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 0, i32 2>
res_vui = vec_xxpermdi(vui, vui, 1);
// CHECK: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 0, i32 3>
-// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 2, i32 1>
+// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 0, i32 3>
res_vss = vec_xxpermdi(vss, vss, 2);
// CHECK: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 1, i32 2>
-// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 3, i32 0>
+// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 1, i32 2>
res_vus = vec_xxpermdi(vus, vus, 3);
// CHECK: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 1, i32 3>
-// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 2, i32 0>
+// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 1, i32 3>
res_vsc = vec_xxpermdi(vsc, vsc, 0);
// CHECK: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 0, i32 2>
-// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 3, i32 1>
+// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 0, i32 2>
res_vuc = vec_xxpermdi(vuc, vuc, 1);
// CHECK: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 0, i32 3>
-// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 2, i32 1>
+// CHECK-LE: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 0, i32 3>
res_vd = vec_xxsldwi(vd, vd, 0);
// CHECK: shufflevector <4 x i32> %{{[0-9]+}}, <4 x i32> %{{[0-9]+}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1786,7 +1786,7 @@ vector int xxpermdi_should_not_assert(vector int a, vector int b) {
// CHECK-LE: bitcast <4 x i32> %{{[0-9]+}} to <2 x i64>
// CHECK-LE-NEXT: bitcast <4 x i32> %{{[0-9]+}} to <2 x i64>
-// CHECK-LE-NEXT: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 3, i32 1>
+// CHECK-LE-NEXT: shufflevector <2 x i64> %{{[0-9]+}}, <2 x i64> %{{[0-9]+}}, <2 x i32> <i32 0, i32 2>
// CHECK-LE-NEXT: bitcast <2 x i64> %{{[0-9]+}} to <4 x i32>
}
diff --git a/test/CodeGen/builtins-systemz-vector-error.c b/test/CodeGen/builtins-systemz-vector-error.c
index 6f4ecc2acad3..f0db90108279 100644
--- a/test/CodeGen/builtins-systemz-vector-error.c
+++ b/test/CodeGen/builtins-systemz-vector-error.c
@@ -27,148 +27,148 @@ const void * volatile cptr;
int cc;
void test_core(void) {
- __builtin_s390_lcbb(cptr, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_lcbb(cptr, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_lcbb(cptr, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_lcbb(cptr, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_lcbb(cptr, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vlbb(cptr, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vlbb(cptr, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vlbb(cptr, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vlbb(cptr, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vlbb(cptr, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vpdi(vul, vul, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vpdi(vul, vul, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vpdi(vul, vul, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vpdi(vul, vul, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vpdi(vul, vul, len); // expected-error {{must be a constant integer}}
}
void test_integer(void) {
- __builtin_s390_verimb(vuc, vuc, vuc, -1); // expected-error {{argument should be a value from 0 to 255}}
- __builtin_s390_verimb(vuc, vuc, vuc, 256); // expected-error {{argument should be a value from 0 to 255}}
+ __builtin_s390_verimb(vuc, vuc, vuc, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_verimb(vuc, vuc, vuc, 256); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_verimb(vuc, vuc, vuc, len); // expected-error {{must be a constant integer}}
- __builtin_s390_verimh(vus, vus, vus, -1); // expected-error {{argument should be a value from 0 to 255}}
- __builtin_s390_verimh(vus, vus, vus, 256); // expected-error {{argument should be a value from 0 to 255}}
+ __builtin_s390_verimh(vus, vus, vus, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_verimh(vus, vus, vus, 256); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_verimh(vus, vus, vus, len); // expected-error {{must be a constant integer}}
- __builtin_s390_verimf(vui, vui, vui, -1); // expected-error {{argument should be a value from 0 to 255}}
- __builtin_s390_verimf(vui, vui, vui, 256); // expected-error {{argument should be a value from 0 to 255}}
+ __builtin_s390_verimf(vui, vui, vui, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_verimf(vui, vui, vui, 256); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_verimf(vui, vui, vui, len); // expected-error {{must be a constant integer}}
- __builtin_s390_verimg(vul, vul, vul, -1); // expected-error {{argument should be a value from 0 to 255}}
- __builtin_s390_verimg(vul, vul, vul, 256); // expected-error {{argument should be a value from 0 to 255}}
+ __builtin_s390_verimg(vul, vul, vul, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_verimg(vul, vul, vul, 256); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_verimg(vul, vul, vul, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vsldb(vuc, vuc, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vsldb(vuc, vuc, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vsldb(vuc, vuc, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vsldb(vuc, vuc, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vsldb(vuc, vuc, len); // expected-error {{must be a constant integer}}
}
void test_string(void) {
- __builtin_s390_vfaeb(vuc, vuc, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfaeb(vuc, vuc, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfaeb(vuc, vuc, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfaeb(vuc, vuc, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfaeb(vuc, vuc, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vfaeh(vus, vus, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfaeh(vus, vus, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfaeh(vus, vus, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfaeh(vus, vus, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfaeh(vus, vus, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vfaef(vui, vui, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfaef(vui, vui, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfaef(vui, vui, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfaef(vui, vui, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfaef(vui, vui, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vfaezb(vuc, vuc, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfaezb(vuc, vuc, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfaezb(vuc, vuc, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfaezb(vuc, vuc, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfaezb(vuc, vuc, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vfaezh(vus, vus, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfaezh(vus, vus, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfaezh(vus, vus, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfaezh(vus, vus, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfaezh(vus, vus, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vfaezf(vui, vui, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfaezf(vui, vui, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfaezf(vui, vui, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfaezf(vui, vui, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfaezf(vui, vui, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vstrcb(vuc, vuc, vuc, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vstrcb(vuc, vuc, vuc, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vstrcb(vuc, vuc, vuc, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vstrcb(vuc, vuc, vuc, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vstrcb(vuc, vuc, vuc, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vstrch(vus, vus, vus, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vstrch(vus, vus, vus, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vstrch(vus, vus, vus, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vstrch(vus, vus, vus, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vstrch(vus, vus, vus, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vstrcf(vui, vui, vui, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vstrcf(vui, vui, vui, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vstrcf(vui, vui, vui, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vstrcf(vui, vui, vui, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vstrcf(vui, vui, vui, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vstrczb(vuc, vuc, vuc, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vstrczb(vuc, vuc, vuc, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vstrczb(vuc, vuc, vuc, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vstrczb(vuc, vuc, vuc, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vstrczb(vuc, vuc, vuc, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vstrczh(vus, vus, vus, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vstrczh(vus, vus, vus, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vstrczh(vus, vus, vus, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vstrczh(vus, vus, vus, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vstrczh(vus, vus, vus, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vstrczf(vui, vui, vui, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vstrczf(vui, vui, vui, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vstrczf(vui, vui, vui, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vstrczf(vui, vui, vui, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vstrczf(vui, vui, vui, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vfaebs(vuc, vuc, -1, &cc); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfaebs(vuc, vuc, 16, &cc); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfaebs(vuc, vuc, -1, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfaebs(vuc, vuc, 16, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfaebs(vuc, vuc, len, &cc); // expected-error {{must be a constant integer}}
- __builtin_s390_vfaehs(vus, vus, -1, &cc); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfaehs(vus, vus, 16, &cc); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfaehs(vus, vus, -1, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfaehs(vus, vus, 16, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfaehs(vus, vus, len, &cc); // expected-error {{must be a constant integer}}
- __builtin_s390_vfaefs(vui, vui, -1, &cc); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfaefs(vui, vui, 16, &cc); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfaefs(vui, vui, -1, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfaefs(vui, vui, 16, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfaefs(vui, vui, len, &cc); // expected-error {{must be a constant integer}}
- __builtin_s390_vfaezbs(vuc, vuc, -1, &cc); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfaezbs(vuc, vuc, 16, &cc); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfaezbs(vuc, vuc, -1, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfaezbs(vuc, vuc, 16, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfaezbs(vuc, vuc, len, &cc); // expected-error {{must be a constant integer}}
- __builtin_s390_vfaezhs(vus, vus, -1, &cc); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfaezhs(vus, vus, 16, &cc); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfaezhs(vus, vus, -1, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfaezhs(vus, vus, 16, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfaezhs(vus, vus, len, &cc); // expected-error {{must be a constant integer}}
- __builtin_s390_vfaezfs(vui, vui, -1, &cc); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfaezfs(vui, vui, 16, &cc); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfaezfs(vui, vui, -1, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfaezfs(vui, vui, 16, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfaezfs(vui, vui, len, &cc); // expected-error {{must be a constant integer}}
- __builtin_s390_vstrcbs(vuc, vuc, vuc, -1, &cc); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vstrcbs(vuc, vuc, vuc, 16, &cc); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vstrcbs(vuc, vuc, vuc, -1, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vstrcbs(vuc, vuc, vuc, 16, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vstrcbs(vuc, vuc, vuc, len, &cc); // expected-error {{must be a constant integer}}
- __builtin_s390_vstrchs(vus, vus, vus, -1, &cc); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vstrchs(vus, vus, vus, 16, &cc); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vstrchs(vus, vus, vus, -1, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vstrchs(vus, vus, vus, 16, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vstrchs(vus, vus, vus, len, &cc); // expected-error {{must be a constant integer}}
- __builtin_s390_vstrcfs(vui, vui, vui, -1, &cc); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vstrcfs(vui, vui, vui, 16, &cc); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vstrcfs(vui, vui, vui, -1, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vstrcfs(vui, vui, vui, 16, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vstrcfs(vui, vui, vui, len, &cc); // expected-error {{must be a constant integer}}
- __builtin_s390_vstrczbs(vuc, vuc, vuc, -1, &cc); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vstrczbs(vuc, vuc, vuc, 16, &cc); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vstrczbs(vuc, vuc, vuc, -1, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vstrczbs(vuc, vuc, vuc, 16, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vstrczbs(vuc, vuc, vuc, len, &cc); // expected-error {{must be a constant integer}}
- __builtin_s390_vstrczhs(vus, vus, vus, -1, &cc); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vstrczhs(vus, vus, vus, 16, &cc); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vstrczhs(vus, vus, vus, -1, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vstrczhs(vus, vus, vus, 16, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vstrczhs(vus, vus, vus, len, &cc); // expected-error {{must be a constant integer}}
- __builtin_s390_vstrczfs(vui, vui, vui, -1, &cc); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vstrczfs(vui, vui, vui, 16, &cc); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vstrczfs(vui, vui, vui, -1, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vstrczfs(vui, vui, vui, 16, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vstrczfs(vui, vui, vui, len, &cc); // expected-error {{must be a constant integer}}
}
void test_float(void) {
- __builtin_s390_vftcidb(vd, -1, &cc); // expected-error {{argument should be a value from 0 to 4095}}
- __builtin_s390_vftcidb(vd, 4096, &cc); // expected-error {{argument should be a value from 0 to 4095}}
+ __builtin_s390_vftcidb(vd, -1, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vftcidb(vd, 4096, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vftcidb(vd, len, &cc); // expected-error {{must be a constant integer}}
- __builtin_s390_vfidb(vd, -1, 0); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfidb(vd, 16, 0); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfidb(vd, -1, 0); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfidb(vd, 16, 0); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfidb(vd, len, 0); // expected-error {{must be a constant integer}}
- __builtin_s390_vfidb(vd, 0, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfidb(vd, 0, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfidb(vd, 0, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfidb(vd, 0, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfidb(vd, 0, len); // expected-error {{must be a constant integer}}
}
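
Note: the hunks above and below track a Sema change — the fixed wording "argument should be a value from 0 to N" is replaced by the generic constant-range diagnostic, matched with expected-error-re so the literal bounds no longer need to appear in the test. A minimal sketch of the updated -verify idiom, assuming a z13 target and a local 16-byte vector typedef comparable to the test's own; the RUN line and function name here are illustrative, not taken from the diff:

// RUN: %clang_cc1 -triple s390x-ibm-linux -target-cpu z13 -fsyntax-only -verify %s
typedef __attribute__((vector_size(16))) unsigned char vuc;
volatile int len;

void sketch(vuc a, vuc b) {
  // Out-of-range immediate: matched by regex, independent of the exact bounds text.
  __builtin_s390_vfaeb(a, b, 16);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
  // Non-constant argument keeps the fixed-wording check.
  __builtin_s390_vfaeb(a, b, len); // expected-error {{must be a constant integer}}
}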
diff --git a/test/CodeGen/builtins-systemz-vector2-error.c b/test/CodeGen/builtins-systemz-vector2-error.c
index ceebaf8cc72f..cd27ac79e15d 100644
--- a/test/CodeGen/builtins-systemz-vector2-error.c
+++ b/test/CodeGen/builtins-systemz-vector2-error.c
@@ -28,34 +28,34 @@ volatile unsigned int len;
int cc;
void test_integer(void) {
- __builtin_s390_vmslg(vul, vul, vuc, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vmslg(vul, vul, vuc, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vmslg(vul, vul, vuc, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vmslg(vul, vul, vuc, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vmslg(vul, vul, vuc, len); // expected-error {{must be a constant integer}}
}
void test_float(void) {
- __builtin_s390_vfmaxdb(vd, vd, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfmaxdb(vd, vd, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfmaxdb(vd, vd, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfmaxdb(vd, vd, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfmaxdb(vd, vd, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vfmindb(vd, vd, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfmindb(vd, vd, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfmindb(vd, vd, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfmindb(vd, vd, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfmindb(vd, vd, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vftcisb(vf, -1, &cc); // expected-error {{argument should be a value from 0 to 4095}}
- __builtin_s390_vftcisb(vf, 4096, &cc); // expected-error {{argument should be a value from 0 to 4095}}
+ __builtin_s390_vftcisb(vf, -1, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vftcisb(vf, 4096, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vftcisb(vf, len, &cc); // expected-error {{must be a constant integer}}
- __builtin_s390_vfisb(vf, -1, 0); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfisb(vf, 16, 0); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfisb(vf, -1, 0); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfisb(vf, 16, 0); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfisb(vf, len, 0); // expected-error {{must be a constant integer}}
- __builtin_s390_vfisb(vf, 0, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfisb(vf, 0, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfisb(vf, 0, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfisb(vf, 0, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfisb(vf, 0, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vfmaxsb(vf, vf, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfmaxsb(vf, vf, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfmaxsb(vf, vf, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfmaxsb(vf, vf, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfmaxsb(vf, vf, len); // expected-error {{must be a constant integer}}
- __builtin_s390_vfminsb(vf, vf, -1); // expected-error {{argument should be a value from 0 to 15}}
- __builtin_s390_vfminsb(vf, vf, 16); // expected-error {{argument should be a value from 0 to 15}}
+ __builtin_s390_vfminsb(vf, vf, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ __builtin_s390_vfminsb(vf, vf, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
__builtin_s390_vfminsb(vf, vf, len); // expected-error {{must be a constant integer}}
}
diff --git a/test/CodeGen/builtins-systemz-zvector-error.c b/test/CodeGen/builtins-systemz-zvector-error.c
index 8d5aaabfbac3..cb60ea495ece 100644
--- a/test/CodeGen/builtins-systemz-zvector-error.c
+++ b/test/CodeGen/builtins-systemz-zvector-error.c
@@ -560,6 +560,6 @@ void test_float(void) {
// expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 31}}
vbl = vec_fp_test_data_class(vd, idx, &cc); // expected-error {{must be a constant integer}}
- vbl = vec_fp_test_data_class(vd, -1, &cc); // expected-error {{should be a value from 0 to 4095}}
- vbl = vec_fp_test_data_class(vd, 4096, &cc); // expected-error {{should be a value from 0 to 4095}}
+ vbl = vec_fp_test_data_class(vd, -1, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ vbl = vec_fp_test_data_class(vd, 4096, &cc); // expected-error-re {{argument value {{.*}} is outside the valid range}}
}
diff --git a/test/CodeGen/builtins-systemz-zvector2-error.c b/test/CodeGen/builtins-systemz-zvector2-error.c
index 823d6374dc0e..3b890b9d09af 100644
--- a/test/CodeGen/builtins-systemz-zvector2-error.c
+++ b/test/CodeGen/builtins-systemz-zvector2-error.c
@@ -127,8 +127,8 @@ void test_integer(void) {
// expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}}
vuc = vec_msum_u128(vul, vul, vuc, idx); // expected-error {{must be a constant integer}}
- vuc = vec_msum_u128(vul, vul, vuc, -1); // expected-error {{should be a value from 0 to 15}}
- vuc = vec_msum_u128(vul, vul, vuc, 16); // expected-error {{should be a value from 0 to 15}}
+ vuc = vec_msum_u128(vul, vul, vuc, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
+ vuc = vec_msum_u128(vul, vul, vuc, 16); // expected-error-re {{argument value {{.*}} is outside the valid range}}
}
void test_float(void) {
diff --git a/test/CodeGen/builtins-wasm.c b/test/CodeGen/builtins-wasm.c
index e0f72d2e5034..1b647b56d1a2 100644
--- a/test/CodeGen/builtins-wasm.c
+++ b/test/CodeGen/builtins-wasm.c
@@ -3,25 +3,49 @@
// RUN: %clang_cc1 -triple wasm64-unknown-unknown -O3 -emit-llvm -o - %s \
// RUN: | FileCheck %s -check-prefix=WEBASSEMBLY64
-__SIZE_TYPE__ f1(void) {
+__SIZE_TYPE__ f0(void) {
+ return __builtin_wasm_memory_size(0);
+// WEBASSEMBLY32: call {{i.*}} @llvm.wasm.memory.size.i32(i32 0)
+// WEBASSEMBLY64: call {{i.*}} @llvm.wasm.memory.size.i64(i32 0)
+}
+
+__SIZE_TYPE__ f1(__SIZE_TYPE__ delta) {
+ return __builtin_wasm_memory_grow(0, delta);
+// WEBASSEMBLY32: call i32 @llvm.wasm.memory.grow.i32(i32 0, i32 %{{.*}})
+// WEBASSEMBLY64: call i64 @llvm.wasm.memory.grow.i64(i32 0, i64 %{{.*}})
+}
+
+__SIZE_TYPE__ f2(void) {
+ return __builtin_wasm_mem_size(0);
+// WEBASSEMBLY32: call {{i.*}} @llvm.wasm.mem.size.i32(i32 0)
+// WEBASSEMBLY64: call {{i.*}} @llvm.wasm.mem.size.i64(i32 0)
+}
+
+__SIZE_TYPE__ f3(__SIZE_TYPE__ delta) {
+ return __builtin_wasm_mem_grow(0, delta);
+// WEBASSEMBLY32: call i32 @llvm.wasm.mem.grow.i32(i32 0, i32 %{{.*}})
+// WEBASSEMBLY64: call i64 @llvm.wasm.mem.grow.i64(i32 0, i64 %{{.*}})
+}
+
+__SIZE_TYPE__ f4(void) {
return __builtin_wasm_current_memory();
// WEBASSEMBLY32: call {{i.*}} @llvm.wasm.current.memory.i32()
// WEBASSEMBLY64: call {{i.*}} @llvm.wasm.current.memory.i64()
}
-__SIZE_TYPE__ f2(__SIZE_TYPE__ delta) {
+__SIZE_TYPE__ f5(__SIZE_TYPE__ delta) {
return __builtin_wasm_grow_memory(delta);
// WEBASSEMBLY32: call i32 @llvm.wasm.grow.memory.i32(i32 %{{.*}})
// WEBASSEMBLY64: call i64 @llvm.wasm.grow.memory.i64(i64 %{{.*}})
}
-void f3(unsigned int tag, void *obj) {
+void f6(unsigned int tag, void *obj) {
return __builtin_wasm_throw(tag, obj);
// WEBASSEMBLY32: call void @llvm.wasm.throw(i32 %{{.*}}, i8* %{{.*}})
// WEBASSEMBLY64: call void @llvm.wasm.throw(i32 %{{.*}}, i8* %{{.*}})
}
-void f4() {
+void f7(void) {
return __builtin_wasm_rethrow();
// WEBASSEMBLY32: call void @llvm.wasm.rethrow()
// WEBASSEMBLY64: call void @llvm.wasm.rethrow()
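
Note: the renumbered functions above keep every spelling alive — __builtin_wasm_memory_size/__builtin_wasm_memory_grow and the transitional mem_size/mem_grow lower to the new llvm.wasm.memory.* and llvm.wasm.mem.* intrinsics, while the legacy current_memory/grow_memory builtins still lower to their old intrinsics. A hedged usage sketch (function names are illustrative):

__SIZE_TYPE__ pages_now(void) {
  return __builtin_wasm_memory_size(0);        // memory index 0
}

__SIZE_TYPE__ grow_by(__SIZE_TYPE__ delta) {
  return __builtin_wasm_memory_grow(0, delta); // previous size in pages, or -1 on failure
}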
diff --git a/test/CodeGen/builtins-x86.c b/test/CodeGen/builtins-x86.c
index fc3cc448cf3c..fd99dd2be31f 100644
--- a/test/CodeGen/builtins-x86.c
+++ b/test/CodeGen/builtins-x86.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -DUSE_64 -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -target-feature +clzero -target-feature +ibt -target-feature +shstk -emit-llvm -o %t %s
-// RUN: %clang_cc1 -DUSE_ALL -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -target-feature +ibt -target-feature +shstk -target-feature +clzero -fsyntax-only -o %t %s
+// RUN: %clang_cc1 -DUSE_64 -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -target-feature +clzero -target-feature +shstk -target-feature +wbnoinvd -target-feature +cldemote -emit-llvm -o %t %s
+// RUN: %clang_cc1 -DUSE_ALL -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -target-feature +shstk -target-feature +clzero -target-feature +wbnoinvd -target-feature +cldemote -fsyntax-only -o %t %s
#ifdef USE_ALL
#define USE_3DNOW
@@ -295,6 +295,7 @@ void f0() {
(void) __builtin_ia32_monitorx(tmp_vp, tmp_Ui, tmp_Ui);
(void) __builtin_ia32_mwaitx(tmp_Ui, tmp_Ui, tmp_Ui);
(void) __builtin_ia32_clzero(tmp_vp);
+ (void) __builtin_ia32_cldemote(tmp_vp);
tmp_V4f = __builtin_ia32_cvtpi2ps(tmp_V4f, tmp_V2i);
tmp_V2i = __builtin_ia32_cvtps2pi(tmp_V4f);
@@ -305,6 +306,7 @@ void f0() {
tmp_i = __rdtsc();
tmp_i = __builtin_ia32_rdtscp(&tmp_Ui);
tmp_LLi = __builtin_ia32_rdpmc(tmp_i);
+ __builtin_ia32_wbnoinvd();
#ifdef USE_64
tmp_LLi = __builtin_ia32_cvtss2si64(tmp_V4f);
tmp_LLi = __builtin_ia32_cvttss2si64(tmp_V4f);
@@ -336,7 +338,6 @@ void f0() {
tmp_V2LLi = __builtin_ia32_psadbw128(tmp_V16c, tmp_V16c);
tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d);
tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d);
- tmp_V4f = __builtin_ia32_cvtdq2ps(tmp_V4i);
tmp_V2LLi = __builtin_ia32_cvtpd2dq(tmp_V2d);
tmp_V2i = __builtin_ia32_cvtpd2pi(tmp_V2d);
tmp_V4f = __builtin_ia32_cvtpd2ps(tmp_V2d);
@@ -432,7 +433,6 @@ void f0() {
tmp_V8f = __builtin_ia32_dpps256(tmp_V8f, tmp_V8f, 0x7);
tmp_V4d = __builtin_ia32_cmppd256(tmp_V4d, tmp_V4d, 0);
tmp_V8f = __builtin_ia32_cmpps256(tmp_V8f, tmp_V8f, 0);
- tmp_V8f = __builtin_ia32_cvtdq2ps256(tmp_V8i);
tmp_V4f = __builtin_ia32_cvtpd2ps256(tmp_V4d);
tmp_V8i = __builtin_ia32_cvtps2dq256(tmp_V8f);
tmp_V4i = __builtin_ia32_cvttpd2dq256(tmp_V4d);
@@ -466,8 +466,6 @@ void f0() {
tmp_i = __builtin_ia32_movmskps256(tmp_V8f);
__builtin_ia32_vzeroall();
__builtin_ia32_vzeroupper();
- tmp_V4d = __builtin_ia32_vbroadcastf128_pd256(tmp_V2dCp);
- tmp_V8f = __builtin_ia32_vbroadcastf128_ps256(tmp_V4fCp);
tmp_V32c = __builtin_ia32_lddqu256(tmp_cCp);
tmp_V2d = __builtin_ia32_maskloadpd(tmp_V2dCp, tmp_V2LLi);
tmp_V4f = __builtin_ia32_maskloadps(tmp_V4fCp, tmp_V4i);
diff --git a/test/CodeGen/builtins.c b/test/CodeGen/builtins.c
index 4f84db00cbd7..77b479e4c112 100644
--- a/test/CodeGen/builtins.c
+++ b/test/CodeGen/builtins.c
@@ -422,6 +422,28 @@ void test_builtin_os_log(void *buf, int i, const char *data) {
// CHECK: %[[V6:.*]] = ptrtoint i8* %[[V5]] to i64
// CHECK: call void @__os_log_helper_1_3_4_4_0_8_34_4_17_8_49(i8* %[[V1]], i32 %[[V2]], i64 %[[V4]], i32 16, i64 %[[V6]])
__builtin_os_log_format(buf, "%d %{public}s %{private}.16P", i, data, data);
+
+ // privacy annotations aren't recognized when they are preceded or followed
+ // by non-whitespace characters.
+
+ // CHECK: call void @__os_log_helper_1_2_1_8_32(
+ __builtin_os_log_format(buf, "%{xyz public}s", data);
+
+ // CHECK: call void @__os_log_helper_1_2_1_8_32(
+ __builtin_os_log_format(buf, "%{ public xyz}s", data);
+
+ // CHECK: call void @__os_log_helper_1_2_1_8_32(
+ __builtin_os_log_format(buf, "%{ public1}s", data);
+
+ // Privacy annotations do not have to be in the first comma-delimited string.
+
+ // CHECK: call void @__os_log_helper_1_2_1_8_34(
+ __builtin_os_log_format(buf, "%{ xyz, public }s", "abc");
+
+ // The last privacy annotation in the string wins.
+
+ // CHECK: call void @__os_log_helper_1_3_1_8_33(
+ __builtin_os_log_format(buf, "%{ private, public, private, public}s", "abc");
}
// CHECK-LABEL: define linkonce_odr hidden void @__os_log_helper_1_3_4_4_0_8_34_4_17_8_49
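
Note: the new builtins.c cases pin down how os_log privacy annotations are parsed. A short sketch restating those rules outside the FileCheck harness (names are illustrative):

void sketch(void *buf, const char *data) {
  // An annotation only counts when it is a whole comma-delimited token,
  // so "public" glued to other characters is ignored:
  __builtin_os_log_format(buf, "%{xyz public}s", data);
  // It may sit in any comma-delimited position, not just the first:
  __builtin_os_log_format(buf, "%{ xyz, public }s", data);
  // When several annotations are given, the last one listed takes effect:
  __builtin_os_log_format(buf, "%{ private, public }s", data);
}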
diff --git a/test/CodeGen/c-strings.c b/test/CodeGen/c-strings.c
index 4e14d988a5dc..a94cd8920e06 100644
--- a/test/CodeGen/c-strings.c
+++ b/test/CodeGen/c-strings.c
@@ -4,16 +4,16 @@
// Should be 3 hello strings, two global (of different sizes), the rest are
// shared.
-// CHECK: @align = global i8 [[ALIGN:[0-9]+]]
+// CHECK: @align = {{(dso_local )?}}global i8 [[ALIGN:[0-9]+]]
// ITANIUM: @.str = private unnamed_addr constant [6 x i8] c"hello\00"
-// MSABI: @"\01??_C@_05CJBACGMB@hello?$AA@" = linkonce_odr unnamed_addr constant [6 x i8] c"hello\00", comdat, align 1
+// MSABI: @"??_C@_05CJBACGMB@hello?$AA@" = linkonce_odr dso_local unnamed_addr constant [6 x i8] c"hello\00", comdat, align 1
// ITANIUM: @f1.x = internal global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0)
-// MSABI: @f1.x = internal global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @"\01??_C@_05CJBACGMB@hello?$AA@", i32 0, i32 0)
+// MSABI: @f1.x = internal global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @"??_C@_05CJBACGMB@hello?$AA@", i32 0, i32 0)
// CHECK: @f2.x = internal global [6 x i8] c"hello\00", align [[ALIGN]]
// CHECK: @f3.x = internal global [8 x i8] c"hello\00\00\00", align [[ALIGN]]
// ITANIUM: @f4.x = internal global %struct.s { i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0) }
-// MSABI: @f4.x = internal global %struct.s { i8* getelementptr inbounds ([6 x i8], [6 x i8]* @"\01??_C@_05CJBACGMB@hello?$AA@", i32 0, i32 0) }
-// CHECK: @x = global [3 x i8] c"ola", align [[ALIGN]]
+// MSABI: @f4.x = internal global %struct.s { i8* getelementptr inbounds ([6 x i8], [6 x i8]* @"??_C@_05CJBACGMB@hello?$AA@", i32 0, i32 0) }
+// CHECK: @x = {{(dso_local )?}}global [3 x i8] c"ola", align [[ALIGN]]
// XFAIL: hexagon
// Hexagon aligns arrays of size 8+ bytes to a 64-bit boundary, which
@@ -32,7 +32,7 @@ void bar(const char *);
void f0() {
bar("hello");
// ITANIUM: call {{.*}}void @bar({{.*}} @.str
- // MSABI: call {{.*}}void @bar({{.*}} @"\01??_C@_05CJBACGMB@hello?$AA@"
+ // MSABI: call {{.*}}void @bar({{.*}} @"??_C@_05CJBACGMB@hello?$AA@"
}
// CHECK-LABEL: define {{.*}}void @f1()
diff --git a/test/CodeGen/c11atomics-ios.c b/test/CodeGen/c11atomics-ios.c
index fb731dfd7a1f..f48e10e4aa43 100644
--- a/test/CodeGen/c11atomics-ios.c
+++ b/test/CodeGen/c11atomics-ios.c
@@ -132,7 +132,7 @@ void testStruct(_Atomic(S) *fp) {
// CHECK-NEXT: [[T0:%.*]] = load [[S]]*, [[S]]** [[FP]]
// CHECK-NEXT: [[T1:%.*]] = bitcast [[S]]* [[TMP0]] to i8*
// CHECK-NEXT: [[T2:%.*]] = bitcast [[S]]* [[F]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 8, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[T1]], i8* align 2 [[T2]], i32 8, i1 false)
// CHECK-NEXT: [[T3:%.*]] = bitcast [[S]]* [[TMP0]] to i64*
// CHECK-NEXT: [[T4:%.*]] = load i64, i64* [[T3]], align 8
// CHECK-NEXT: [[T5:%.*]] = bitcast [[S]]* [[T0]] to i64*
@@ -154,7 +154,7 @@ void testPromotedStruct(_Atomic(PS) *fp) {
// CHECK-NEXT: [[P:%.*]] = load [[APS]]*, [[APS]]** [[FP]]
// CHECK-NEXT: [[T0:%.*]] = bitcast [[APS]]* [[P]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[T0]], i8 0, i64 8, i1 false)
// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[P]], i32 0, i32 0
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]], [[PS]]* [[T0]], i32 0, i32 0
// CHECK-NEXT: store i16 1, i16* [[T1]], align 8
@@ -165,7 +165,7 @@ void testPromotedStruct(_Atomic(PS) *fp) {
__c11_atomic_init(fp, (PS){1,2,3});
// CHECK-NEXT: [[T0:%.*]] = bitcast [[APS]]* [[X]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* [[T0]], i8 0, i32 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 8 [[T0]], i8 0, i32 8, i1 false)
// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[X]], i32 0, i32 0
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]], [[PS]]* [[T0]], i32 0, i32 0
// CHECK-NEXT: store i16 1, i16* [[T1]], align 8
@@ -183,16 +183,16 @@ void testPromotedStruct(_Atomic(PS) *fp) {
// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[TMP0]], i32 0, i32 0
// CHECK-NEXT: [[T1:%.*]] = bitcast [[PS]]* [[F]] to i8*
// CHECK-NEXT: [[T2:%.*]] = bitcast [[PS]]* [[T0]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 6, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[T1]], i8* align 8 [[T2]], i32 6, i1 false)
PS f = *fp;
// CHECK-NEXT: [[T0:%.*]] = load [[APS]]*, [[APS]]** [[FP]]
// CHECK-NEXT: [[T1:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[TMP1]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* [[T1]], i8 0, i32 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 8 [[T1]], i8 0, i32 8, i1 false)
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[TMP1]], i32 0, i32 0
// CHECK-NEXT: [[T2:%.*]] = bitcast [[PS]]* [[T1]] to i8*
// CHECK-NEXT: [[T3:%.*]] = bitcast [[PS]]* [[F]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T2]], i8* [[T3]], i32 6, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[T2]], i8* align 2 [[T3]], i32 6, i1 false)
// CHECK-NEXT: [[T4:%.*]] = bitcast [[APS]]* [[TMP1]] to i64*
// CHECK-NEXT: [[T5:%.*]] = load i64, i64* [[T4]], align 8
// CHECK-NEXT: [[T6:%.*]] = bitcast [[APS]]* [[T0]] to i64*
@@ -215,7 +215,7 @@ PS test_promoted_load(_Atomic(PS) *addr) {
// CHECK: [[ATOMIC_RES_STRUCT:%.*]] = bitcast i64* [[ATOMIC_RES64]] to %struct.PS*
// CHECK: [[AGG_RESULT8:%.*]] = bitcast %struct.PS* %agg.result to i8*
// CHECK: [[ATOMIC_RES8:%.*]] = bitcast %struct.PS* [[ATOMIC_RES_STRUCT]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[AGG_RESULT8]], i8* [[ATOMIC_RES8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[AGG_RESULT8]], i8* align 8 [[ATOMIC_RES8]], i32 6, i1 false)
return __c11_atomic_load(addr, 5);
}
@@ -232,11 +232,11 @@ void test_promoted_store(_Atomic(PS) *addr, PS *val) {
// CHECK: [[VAL:%.*]] = load %struct.PS*, %struct.PS** [[VAL_ARG]], align 4
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
// CHECK: [[VAL8:%.*]] = bitcast %struct.PS* [[VAL]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[VAL8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[VAL8]], i32 6, i1 false)
// CHECK: [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
// CHECK: [[ATOMIC_VAL8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i8*
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_VAL8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[ATOMIC_VAL8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
// CHECK: [[ATOMIC_VAL64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i64*
// CHECK: [[VAL64:%.*]] = load i64, i64* [[ATOMIC_VAL64]], align 8
// CHECK: store atomic i64 [[VAL64]], i64* [[ADDR64]] seq_cst, align 8
@@ -257,11 +257,11 @@ PS test_promoted_exchange(_Atomic(PS) *addr, PS *val) {
// CHECK: [[VAL:%.*]] = load %struct.PS*, %struct.PS** [[VAL_ARG]], align 4
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
// CHECK: [[VAL8:%.*]] = bitcast %struct.PS* [[VAL]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[VAL8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[VAL8]], i32 6, i1 false)
// CHECK: [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
// CHECK: [[ATOMIC_VAL8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i8*
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_VAL8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[ATOMIC_VAL8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
// CHECK: [[ATOMIC_VAL64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i64*
// CHECK: [[ATOMIC_RES64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_RES]] to i64*
// CHECK: [[VAL64:%.*]] = load i64, i64* [[ATOMIC_VAL64]], align 8
@@ -270,7 +270,7 @@ PS test_promoted_exchange(_Atomic(PS) *addr, PS *val) {
// CHECK: [[ATOMIC_RES_STRUCT:%.*]] = bitcast i64* [[ATOMIC_RES64]] to %struct.PS*
// CHECK: [[AGG_RESULT8:%.*]] = bitcast %struct.PS* %agg.result to i8*
// CHECK: [[ATOMIC_RES8:%.*]] = bitcast %struct.PS* [[ATOMIC_RES_STRUCT]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[AGG_RESULT8]], i8* [[ATOMIC_RES8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[AGG_RESULT8]], i8* align 8 [[ATOMIC_RES8]], i32 6, i1 false)
return __c11_atomic_exchange(addr, *val, 5);
}
@@ -291,15 +291,15 @@ _Bool test_promoted_cmpxchg(_Atomic(PS) *addr, PS *desired, PS *new) {
// CHECK: [[NEW:%.*]] = load %struct.PS*, %struct.PS** [[NEW_ARG]], align 4
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
// CHECK: [[NEW8:%.*]] = bitcast %struct.PS* [[NEW]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[NEW8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[NEW8]], i32 6, i1 false)
// CHECK: [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
// CHECK: [[ATOMIC_DESIRED8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_DESIRED:%.*]] to i8*
// CHECK: [[DESIRED8:%.*]] = bitcast %struct.PS* [[DESIRED]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_DESIRED8]], i8* [[DESIRED8]], i64 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[ATOMIC_DESIRED8]], i8* align 2 [[DESIRED8]], i64 6, i1 false)
// CHECK: [[ATOMIC_DESIRED64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_DESIRED:%.*]] to i64*
// CHECK: [[ATOMIC_NEW8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_NEW]] to i8*
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_NEW8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[ATOMIC_NEW8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
// CHECK: [[ATOMIC_NEW64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_NEW]] to i64*
// CHECK: [[ATOMIC_DESIRED_VAL64:%.*]] = load i64, i64* [[ATOMIC_DESIRED64]], align 8
// CHECK: [[ATOMIC_NEW_VAL64:%.*]] = load i64, i64* [[ATOMIC_NEW64]], align 8
@@ -319,3 +319,19 @@ _Bool test_promoted_cmpxchg(_Atomic(PS) *addr, PS *desired, PS *new) {
return __c11_atomic_compare_exchange_strong(addr, desired, *new, 5, 5);
}
+
+struct Empty {};
+
+struct Empty testEmptyStructLoad(_Atomic(struct Empty)* empty) {
+ // CHECK-LABEL: @testEmptyStructLoad(
+ // CHECK-NOT: @__atomic_load
+ // CHECK: load atomic i8, i8* %{{.*}} seq_cst, align 1
+ return *empty;
+}
+
+void testEmptyStructStore(_Atomic(struct Empty)* empty, struct Empty value) {
+ // CHECK-LABEL: @testEmptyStructStore(
+ // CHECK-NOT: @__atomic_store
+ // CHECK: store atomic i8 %{{.*}}, i8* %{{.*}} seq_cst, align 1
+ *empty = value;
+}
diff --git a/test/CodeGen/c11atomics.c b/test/CodeGen/c11atomics.c
index ccb642174c94..cf251738be55 100644
--- a/test/CodeGen/c11atomics.c
+++ b/test/CodeGen/c11atomics.c
@@ -282,7 +282,7 @@ void testStruct(_Atomic(S) *fp) {
// CHECK-NEXT: [[T0:%.*]] = load [[S]]*, [[S]]** [[FP]]
// CHECK-NEXT: [[T1:%.*]] = bitcast [[S]]* [[TMP0]] to i8*
// CHECK-NEXT: [[T2:%.*]] = bitcast [[S]]* [[F]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 8, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[T1]], i8* align 2 [[T2]], i32 8, i1 false)
// CHECK-NEXT: [[T3:%.*]] = bitcast [[S]]* [[T0]] to i8*
// CHECK-NEXT: [[T4:%.*]] = bitcast [[S]]* [[TMP0]] to i8*
// CHECK-NEXT: call arm_aapcscc void @__atomic_store(i32 8, i8* [[T3]], i8* [[T4]], i32 5)
@@ -307,7 +307,7 @@ void testPromotedStruct(_Atomic(PS) *fp) {
// CHECK-NEXT: [[P:%.*]] = load [[APS]]*, [[APS]]** [[FP]]
// CHECK-NEXT: [[T0:%.*]] = bitcast [[APS]]* [[P]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[T0]], i8 0, i64 8, i1 false)
// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[P]], i32 0, i32 0
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]], [[PS]]* [[T0]], i32 0, i32 0
// CHECK-NEXT: store i16 1, i16* [[T1]], align 8
@@ -318,7 +318,7 @@ void testPromotedStruct(_Atomic(PS) *fp) {
__c11_atomic_init(fp, (PS){1,2,3});
// CHECK-NEXT: [[T0:%.*]] = bitcast [[APS]]* [[X]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* [[T0]], i8 0, i32 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 8 [[T0]], i8 0, i32 8, i1 false)
// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[X]], i32 0, i32 0
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]], [[PS]]* [[T0]], i32 0, i32 0
// CHECK-NEXT: store i16 1, i16* [[T1]], align 8
@@ -335,16 +335,16 @@ void testPromotedStruct(_Atomic(PS) *fp) {
// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[TMP0]], i32 0, i32 0
// CHECK-NEXT: [[T1:%.*]] = bitcast [[PS]]* [[F]] to i8*
// CHECK-NEXT: [[T2:%.*]] = bitcast [[PS]]* [[T0]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 6, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[T1]], i8* align 8 [[T2]], i32 6, i1 false)
PS f = *fp;
// CHECK-NEXT: [[T0:%.*]] = load [[APS]]*, [[APS]]** [[FP]]
// CHECK-NEXT: [[T1:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[TMP1]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* [[T1]], i8 0, i32 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 8 [[T1]], i8 0, i32 8, i1 false)
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[TMP1]], i32 0, i32 0
// CHECK-NEXT: [[T2:%.*]] = bitcast [[PS]]* [[T1]] to i8*
// CHECK-NEXT: [[T3:%.*]] = bitcast [[PS]]* [[F]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T2]], i8* [[T3]], i32 6, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[T2]], i8* align 2 [[T3]], i32 6, i1 false)
// CHECK-NEXT: [[T4:%.*]] = bitcast [[APS]]* [[T0]] to i8*
// CHECK-NEXT: [[T5:%.*]] = bitcast [[APS]]* [[TMP1]] to i8*
// CHECK-NEXT: call arm_aapcscc void @__atomic_store(i32 8, i8* [[T4]], i8* [[T5]], i32 5)
@@ -357,7 +357,7 @@ void testPromotedStruct(_Atomic(PS) *fp) {
// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[TMP3]], i32 0, i32 0
// CHECK-NEXT: [[T1:%.*]] = bitcast %struct.PS* [[TMP2]] to i8*
// CHECK-NEXT: [[T2:%.*]] = bitcast %struct.PS* [[T0]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 6, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[T1]], i8* align 8 [[T2]], i32 6, i1 false)
// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds %struct.PS, %struct.PS* [[TMP2]], i32 0, i32 0
// CHECK-NEXT: [[T1:%.*]] = load i16, i16* [[T0]], align 2
// CHECK-NEXT: [[T2:%.*]] = sext i16 [[T1]] to i32
@@ -381,7 +381,7 @@ PS test_promoted_load(_Atomic(PS) *addr) {
// CHECK: [[ATOMIC_RES_STRUCT:%.*]] = bitcast i64* [[ATOMIC_RES64]] to %struct.PS*
// CHECK: [[AGG_RESULT8:%.*]] = bitcast %struct.PS* %agg.result to i8*
// CHECK: [[ATOMIC_RES8:%.*]] = bitcast %struct.PS* [[ATOMIC_RES_STRUCT]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[AGG_RESULT8]], i8* [[ATOMIC_RES8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[AGG_RESULT8]], i8* align 8 [[ATOMIC_RES8]], i32 6, i1 false)
return __c11_atomic_load(addr, 5);
}
@@ -398,11 +398,11 @@ void test_promoted_store(_Atomic(PS) *addr, PS *val) {
// CHECK: [[VAL:%.*]] = load %struct.PS*, %struct.PS** [[VAL_ARG]], align 4
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
// CHECK: [[VAL8:%.*]] = bitcast %struct.PS* [[VAL]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[VAL8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[VAL8]], i32 6, i1 false)
// CHECK: [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
// CHECK: [[ATOMIC_VAL8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i8*
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_VAL8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[ATOMIC_VAL8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
// CHECK: [[ATOMIC_VAL64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i64*
// CHECK: [[ADDR8:%.*]] = bitcast i64* [[ADDR64]] to i8*
// CHECK: [[VAL64:%.*]] = load i64, i64* [[ATOMIC_VAL64]], align 2
@@ -423,11 +423,11 @@ PS test_promoted_exchange(_Atomic(PS) *addr, PS *val) {
// CHECK: [[VAL:%.*]] = load %struct.PS*, %struct.PS** [[VAL_ARG]], align 4
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
// CHECK: [[VAL8:%.*]] = bitcast %struct.PS* [[VAL]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[VAL8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[VAL8]], i32 6, i1 false)
// CHECK: [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
// CHECK: [[ATOMIC_VAL8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i8*
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_VAL8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[ATOMIC_VAL8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
// CHECK: [[ATOMIC_VAL64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i64*
// CHECK: [[ATOMIC_RES64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_RES]] to i64*
// CHECK: [[ADDR8:%.*]] = bitcast i64* [[ADDR64]] to i8*
@@ -437,7 +437,7 @@ PS test_promoted_exchange(_Atomic(PS) *addr, PS *val) {
// CHECK: [[ATOMIC_RES_STRUCT:%.*]] = bitcast i64* [[ATOMIC_RES64]] to %struct.PS*
// CHECK: [[AGG_RESULT8:%.*]] = bitcast %struct.PS* %agg.result to i8*
// CHECK: [[ATOMIC_RES8:%.*]] = bitcast %struct.PS* [[ATOMIC_RES_STRUCT]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[AGG_RESULT8]], i8* [[ATOMIC_RES8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[AGG_RESULT8]], i8* align 8 [[ATOMIC_RES8]], i32 6, i1 false)
return __c11_atomic_exchange(addr, *val, 5);
}
@@ -457,15 +457,15 @@ _Bool test_promoted_cmpxchg(_Atomic(PS) *addr, PS *desired, PS *new) {
// CHECK: [[NEW:%.*]] = load %struct.PS*, %struct.PS** [[NEW_ARG]], align 4
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
// CHECK: [[NEW8:%.*]] = bitcast %struct.PS* [[NEW]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[NEW8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[NEW8]], i32 6, i1 false)
// CHECK: [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
// CHECK: [[ATOMIC_DESIRED8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_DESIRED]] to i8*
// CHECK: [[DESIRED8:%.*]] = bitcast %struct.PS* [[DESIRED]]to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_DESIRED8]], i8* [[DESIRED8]], i64 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[ATOMIC_DESIRED8]], i8* align 2 [[DESIRED8]], i64 6, i1 false)
// CHECK: [[ATOMIC_DESIRED64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_DESIRED]] to i64*
// CHECK: [[ATOMIC_NEW8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_NEW]] to i8*
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_NEW8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[ATOMIC_NEW8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
// CHECK: [[ATOMIC_NEW64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_NEW]] to i64*
// CHECK: [[ADDR8:%.*]] = bitcast i64* [[ADDR64]] to i8*
// CHECK: [[ATOMIC_DESIRED8:%.*]] = bitcast i64* [[ATOMIC_DESIRED64]] to i8*
@@ -474,3 +474,17 @@ _Bool test_promoted_cmpxchg(_Atomic(PS) *addr, PS *desired, PS *new) {
// CHECK: ret i1 [[RES]]
return __c11_atomic_compare_exchange_strong(addr, desired, *new, 5, 5);
}
+
+struct Empty {};
+
+struct Empty test_empty_struct_load(_Atomic(struct Empty)* empty) {
+ // CHECK-LABEL: @test_empty_struct_load(
+ // CHECK: call arm_aapcscc zeroext i8 @__atomic_load_1(i8* %{{.*}}, i32 5)
+ return __c11_atomic_load(empty, 5);
+}
+
+void test_empty_struct_store(_Atomic(struct Empty)* empty, struct Empty value) {
+ // CHECK-LABEL: @test_empty_struct_store(
+ // CHECK: call arm_aapcscc void @__atomic_store_1(i8* %{{.*}}, i8 zeroext %{{.*}}, i32 5)
+ __c11_atomic_store(empty, value, 5);
+}
diff --git a/test/CodeGen/cetintrin.c b/test/CodeGen/cetintrin.c
index 085462a6626d..f70d1c80e1cf 100644
--- a/test/CodeGen/cetintrin.c
+++ b/test/CodeGen/cetintrin.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -ffreestanding %s -triple=i386-apple-darwin -target-feature +shstk -emit-llvm -o - -Wall -Werror | FileCheck %s
-// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +shstk -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=X86_64
+// RUN: %clang_cc1 -ffreestanding %s -triple=i386-unknown-unknown -target-feature +shstk -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=I386 --check-prefix=CHECK
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +shstk -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=X86_64 --check-prefix=CHECK
#include <immintrin.h>
@@ -15,6 +15,20 @@ void test_incsspq(int a) {
// X86_64: call void @llvm.x86.incsspq(i64 %{{[a-z0-9.]+}})
_incsspq(a);
}
+
+void test_inc_ssp(unsigned int a) {
+ // X86_64-LABEL: @test_inc_ssp
+ // X86_64: call void @llvm.x86.incsspq(i64 %{{[a-z0-9.]+}})
+ _inc_ssp(a);
+}
+#else
+
+void test_inc_ssp(unsigned int a) {
+ // I386-LABEL: @test_inc_ssp
+ // I386: call void @llvm.x86.incsspd(i32 %{{[0-9]+}})
+ _inc_ssp(a);
+}
+
#endif
unsigned int test_rdsspd(unsigned int a) {
@@ -29,6 +43,21 @@ unsigned long long test_rdsspq(unsigned long long a) {
// X86_64: call i64 @llvm.x86.rdsspq(i64 %{{[a-z0-9.]+}})
return _rdsspq(a);
}
+
+unsigned long long test_get_ssp(void) {
+ // X86_64-LABEL: @test_get_ssp
+ // X86_64: call i64 @llvm.x86.rdsspq(i64 0)
+ return _get_ssp();
+}
+
+#else
+
+unsigned int test_get_ssp(void) {
+ // I386-LABEL: @test_get_ssp
+ // I386: call i32 @llvm.x86.rdsspd(i32 0)
+ return _get_ssp();
+}
+
#endif
void test_saveprevssp() {
diff --git a/test/CodeGen/cfi-check-fail2.c b/test/CodeGen/cfi-check-fail2.c
index 5340871c2ebe..9c4d0ac2ea36 100644
--- a/test/CodeGen/cfi-check-fail2.c
+++ b/test/CodeGen/cfi-check-fail2.c
@@ -3,6 +3,12 @@
// RUN: -fsanitize=cfi-vcall \
// RUN: -emit-llvm -o - %s | FileCheck %s
+// Check that the blacklist does not affect the generated code.
+// RUN: echo "src:*" > %t-all.blacklist
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -O0 -fsanitize-cfi-cross-dso \
+// RUN: -fsanitize=cfi-vcall -fsanitize-blacklist=%t-all.blacklist \
+// RUN: -emit-llvm -o - %s | FileCheck %s
+
void caller(void (*f)()) {
f();
}
diff --git a/test/CodeGen/cfi-icall-cross-dso.c b/test/CodeGen/cfi-icall-cross-dso.c
index 43ab0e73b14a..1df20aa1c8a3 100644
--- a/test/CodeGen/cfi-icall-cross-dso.c
+++ b/test/CodeGen/cfi-icall-cross-dso.c
@@ -39,6 +39,11 @@
// MS-DIAG: call void @__cfi_slowpath_diag(i64 4195979634929632483, i8* %{{.*}}, {{.*}}@[[DATA]]{{.*}}) {{.*}}, !nosanitize
// MS-TRAP: call void @__cfi_slowpath(i64 4195979634929632483, i8* %{{.*}}) {{.*}}, !nosanitize
+// ITANIUM-DIAG: declare void @__cfi_slowpath_diag(
+// ITANIUM-TRAP: declare void @__cfi_slowpath(
+// MS-DIAG: declare dso_local void @__cfi_slowpath_diag(
+// MS-TRAP: declare dso_local void @__cfi_slowpath(
+
void caller(void (*f)()) {
f();
}
@@ -49,7 +54,7 @@ void caller(void (*f)()) {
// CHECK: define internal void @g({{.*}} !type [[TVOID:![0-9]+]] !type [[TVOID_GENERALIZED:![0-9]+]] !type [[TVOID_ID:![0-9]+]]
static void g(void) {}
-// CHECK: declare void @h({{[^!]*$}}
+// CHECK: declare {{(dso_local )?}}void @h({{[^!]*$}}
void h(void);
typedef void (*Fn)(void);
@@ -60,12 +65,15 @@ Fn h1() {
return &h;
}
-// CHECK: define void @bar({{.*}} !type [[TNOPROTO:![0-9]+]] !type [[TNOPROTO_GENERALIZED:![0-9]+]] !type [[TNOPROTO_ID:![0-9]+]]
+// CHECK: define {{(dso_local )?}}void @bar({{.*}} !type [[TNOPROTO:![0-9]+]] !type [[TNOPROTO_GENERALIZED:![0-9]+]] !type [[TNOPROTO_ID:![0-9]+]]
// ITANIUM: define available_externally void @foo({{[^!]*$}}
-// MS: define linkonce_odr void @foo({{.*}} !type [[TNOPROTO]] !type [[TNOPROTO_GENERALIZED:![0-9]+]] !type [[TNOPROTO_ID]]
+// MS: define linkonce_odr dso_local void @foo({{.*}} !type [[TNOPROTO]] !type [[TNOPROTO_GENERALIZED:![0-9]+]] !type [[TNOPROTO_ID]]
inline void foo() {}
void bar() { foo(); }
+// ITANIUM: define weak void @__cfi_check
+// MS: define weak dso_local void @__cfi_check
+
// CHECK: !{i32 4, !"Cross-DSO CFI", i32 1}
// Check that the type entries are correct.
diff --git a/test/CodeGen/cfi-icall.c b/test/CodeGen/cfi-icall.c
index 5f346b66e81e..83fc8f89fbcf 100644
--- a/test/CodeGen/cfi-icall.c
+++ b/test/CodeGen/cfi-icall.c
@@ -3,20 +3,20 @@
// Tests that we assign appropriate identifiers to unprototyped functions.
-// CHECK: define void @f({{.*}} !type [[TVOID:![0-9]+]] !type [[TVOID_GENERALIZED:![0-9]+]]
+// CHECK: define {{(dso_local )?}}void @f({{.*}} !type [[TVOID:![0-9]+]] !type [[TVOID_GENERALIZED:![0-9]+]]
void f() {
}
void xf();
-// CHECK: define void @g({{.*}} !type [[TINT:![0-9]+]] !type [[TINT_GENERALIZED:![0-9]+]]
+// CHECK: define {{(dso_local )?}}void @g({{.*}} !type [[TINT:![0-9]+]] !type [[TINT_GENERALIZED:![0-9]+]]
void g(int b) {
void (*fp)() = b ? f : xf;
// ITANIUM: call i1 @llvm.type.test(i8* {{.*}}, metadata !"_ZTSFvE")
fp();
}
-// CHECK: declare !type [[TVOID]] !type [[TVOID_GENERALIZED]] void @xf({{.*}}
+// CHECK: declare !type [[TVOID]] !type [[TVOID_GENERALIZED]] {{(dso_local )?}}void @xf({{.*}}
// ITANIUM-DAG: [[TVOID]] = !{i64 0, !"_ZTSFvE"}
// ITANIUM-DAG: [[TVOID_GENERALIZED]] = !{i64 0, !"_ZTSFvE.generalized"}
diff --git a/test/CodeGen/cfstring-windows.c b/test/CodeGen/cfstring-windows.c
index e54c86089078..fe7511eae141 100644
--- a/test/CodeGen/cfstring-windows.c
+++ b/test/CodeGen/cfstring-windows.c
@@ -31,8 +31,8 @@ __declspec(dllimport) long __CFConstantStringClassReference[];
typedef struct __CFString *CFStringRef;
const CFStringRef string = (CFStringRef)__builtin___CFStringMakeConstantString("string");
-// CHECK-CF-IN-CF-DECL: @__CFConstantStringClassReference = external dllexport global [0 x i32]
-// CHECK-CF-IN-CF-DEFN: @__CFConstantStringClassReference = common dllexport global [32 x i32]
+// CHECK-CF-IN-CF-DECL: @__CFConstantStringClassReference = external dso_local dllexport global [0 x i32]
+// CHECK-CF-IN-CF-DEFN: @__CFConstantStringClassReference = common dso_local dllexport global [32 x i32]
// CHECK-CF: @__CFConstantStringClassReference = external dllimport global [0 x i32]
// CHECK-CF-EXTERN: @__CFConstantStringClassReference = external dllimport global [0 x i32]
// CHECK-CF-EXTERN-DLLIMPORT: @__CFConstantStringClassReference = external dllimport global [0 x i32]
diff --git a/test/CodeGen/clang-sections-attribute.c b/test/CodeGen/clang-sections-attribute.c
new file mode 100644
index 000000000000..7223d7d97a7a
--- /dev/null
+++ b/test/CodeGen/clang-sections-attribute.c
@@ -0,0 +1,76 @@
+// RUN: %clang_cc1 -emit-llvm -triple arm-none-eabi -o - %s | FileCheck %s
+
+// Test interaction between __attribute__((section())) and '#pragma clang
+// section' directives. The attribute should always have higher priority than
+// the pragma.
+
+// Text tests.
+#pragma clang section text=".ext_fun_pragma"
+void ext_fun(void) __attribute__((section(".ext_fun_attr")));
+void ext_fun(void) {}
+#pragma clang section text=""
+
+void ext_fun2(void) __attribute__((section(".ext_fun2_attr")));
+#pragma clang section text=".ext_fun2_pragma"
+void ext_fun2(void) {}
+#pragma clang section text=""
+
+#pragma clang section text=".int_fun_pragma"
+static void int_fun(void) __attribute__((section(".int_fun_attr"), used));
+static void int_fun(void) {}
+#pragma clang section text=""
+
+static void int_fun2(void) __attribute__((section(".int_fun2_attr"), used));
+#pragma clang section text=".int_fun2_pragma"
+static void int_fun2(void) {}
+#pragma clang section text=""
+
+// Rodata tests.
+#pragma clang section rodata=".ext_const_pragma"
+__attribute__((section(".ext_const_attr")))
+const int ext_const = 1;
+#pragma clang section rodata=""
+
+#pragma clang section rodata=".int_const_pragma"
+__attribute__((section(".int_const_attr"), used))
+static const int int_const = 1;
+#pragma clang section rodata=""
+
+// Data tests.
+#pragma clang section data=".ext_var_pragma"
+__attribute__((section(".ext_var_attr")))
+int ext_var = 1;
+#pragma clang section data=""
+
+#pragma clang section data=".int_var_pragma"
+__attribute__((section(".int_var_attr"), used))
+static int int_var = 1;
+#pragma clang section data=""
+
+// Bss tests.
+#pragma clang section bss=".ext_zvar_pragma"
+__attribute__((section(".ext_zvar_attr")))
+int ext_zvar;
+#pragma clang section bss=""
+
+#pragma clang section bss=".int_zvar_pragma"
+__attribute__((section(".int_zvar_attr"), used))
+static int int_zvar;
+#pragma clang section bss=""
+
+// CHECK: @ext_const = constant i32 1, section ".ext_const_attr", align 4{{$}}
+// CHECK: @int_const = internal constant i32 1, section ".int_const_attr", align 4{{$}}
+// CHECK: @ext_var = global i32 1, section ".ext_var_attr", align 4{{$}}
+// CHECK: @int_var = internal global i32 1, section ".int_var_attr", align 4{{$}}
+// CHECK: @ext_zvar = global i32 0, section ".ext_zvar_attr", align 4{{$}}
+// CHECK: @int_zvar = internal global i32 0, section ".int_zvar_attr", align 4{{$}}
+// CHECK: define void @ext_fun() #0 section ".ext_fun_attr"
+// CHECK: define void @ext_fun2() #0 section ".ext_fun2_attr"
+// CHECK: define internal void @int_fun() #0 section ".int_fun_attr"
+// CHECK: define internal void @int_fun2() #0 section ".int_fun2_attr"
+//
+// Function attributes should not include implicit-section-name.
+// CHECK-NOT: attributes #0 = {{.*}}implicit-section-name
+//
+// No other attribute group should be present in the file.
+// CHECK-NOT: attributes #1
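
Note: a condensed sketch of the precedence the new test establishes — an explicit section attribute always overrides an enclosing '#pragma clang section' (section and variable names here are illustrative):

#pragma clang section data=".from_pragma"
__attribute__((section(".from_attr"))) int wins_attr = 1; // emitted into .from_attr
int wins_pragma = 2;                                      // emitted into .from_pragma
#pragma clang section data=""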
diff --git a/test/CodeGen/cldemote.c b/test/CodeGen/cldemote.c
new file mode 100644
index 000000000000..8c7bdf539333
--- /dev/null
+++ b/test/CodeGen/cldemote.c
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +cldemote -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +cldemote -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+#include <immintrin.h>
+
+void test_cldemote(const void *p) {
+ //CHECK-LABEL: @test_cldemote
+ //CHECK: call void @llvm.x86.cldemote(i8* %{{.*}})
+ _cldemote(p);
+}
diff --git a/test/CodeGen/code-coverage.c b/test/CodeGen/code-coverage.c
index 80307f49ef34..310d58544e41 100644
--- a/test/CodeGen/code-coverage.c
+++ b/test/CodeGen/code-coverage.c
@@ -2,7 +2,16 @@
// RUN: %clang_cc1 -emit-llvm -disable-red-zone -femit-coverage-data -coverage-no-function-names-in-data %s -o - | FileCheck %s --check-prefix WITHOUTNAMES
// RUN: %clang_cc1 -emit-llvm -disable-red-zone -femit-coverage-data -coverage-notes-file=aaa.gcno -coverage-data-file=bbb.gcda -dwarf-column-info -debug-info-kind=limited -dwarf-version=4 %s -o - | FileCheck %s --check-prefix GCOV_FILE_INFO
-// <rdar://problem/12843084>
+// RUN: %clang_cc1 -emit-llvm-bc -o /dev/null -fexperimental-new-pass-manager -fdebug-pass-manager -femit-coverage-data %s 2>&1 | FileCheck --check-prefix=NEWPM %s
+// RUN: %clang_cc1 -emit-llvm-bc -o /dev/null -fexperimental-new-pass-manager -fdebug-pass-manager -femit-coverage-data -O3 %s 2>&1 | FileCheck --check-prefix=NEWPM-O3 %s
+
+// NEWPM-NOT: Running pass
+// NEWPM: Running pass: GCOVProfilerPass
+
+// NEWPM-O3-NOT: Running pass
+// NEWPM-O3: Running pass: ForceFunctionAttrsPass
+// NEWPM-O3: Running pass: GCOVProfilerPass
+
int test1(int a) {
switch (a % 2) {
@@ -14,17 +23,25 @@ int test1(int a) {
return a;
}
+int test2(int b) {
+ return b * 2;
+}
+
+// Inside the function-emission data structure, check that
+// -coverage-no-function-names-in-data uses null as the function name.
+// CHECK: @__llvm_internal_gcov_emit_function_args.0 = internal unnamed_addr constant
+// CHECK-SAME: { i32 {{[0-9]+}}, i8* getelementptr inbounds ({{[^,]*}}, {{[^,]*}}* @
+// CHECK-SAME: { i32 {{[0-9]+}}, i8* getelementptr inbounds ({{[^,]*}}, {{[^,]*}}* @
+// WITHOUTNAMES: @__llvm_internal_gcov_emit_function_args.0 = internal unnamed_addr constant
+// WITHOUTNAMES-NOT: getelementptr inbounds ({{.*}}@
+// WITHOUTNAMES-SAME: zeroinitializer,
+// WITHOUTNAMES-NOT: getelementptr inbounds ({{.*}}@
+// WITHOUTNAMES-SAME: { i32 {{[0-9]+}}, i8* null,
+
// Check that the noredzone flag is set on the generated functions.
// CHECK: void @__llvm_gcov_indirect_counter_increment(i32* %{{.*}}, i64** %{{.*}}) unnamed_addr [[NRZ:#[0-9]+]]
-
-// Inside llvm_gcov_writeout, check that -coverage-no-function-names-in-data
-// passes null as the function name.
// CHECK: void @__llvm_gcov_writeout() unnamed_addr [[NRZ]]
-// CHECK: call void @llvm_gcda_emit_function({{.*}}, i8* getelementptr {{.*}}, {{.*}})
-// WITHOUTNAMES: void @__llvm_gcov_writeout() unnamed_addr
-// WITHOUTNAMES: call void @llvm_gcda_emit_function({{.*}}, i8* null, {{.*}})
-
// CHECK: void @__llvm_gcov_flush() unnamed_addr [[NRZ]]
// CHECK: void @__llvm_gcov_init() unnamed_addr [[NRZ]]
diff --git a/test/CodeGen/compound-literal.c b/test/CodeGen/compound-literal.c
index 6507341ce2c1..38675a7dda22 100644
--- a/test/CodeGen/compound-literal.c
+++ b/test/CodeGen/compound-literal.c
@@ -33,7 +33,7 @@ void f() {
// CHECK-NEXT: store i32 [[TMP]], i32* [[CY]]
// CHECK-NEXT: [[SI8:%[a-zA-Z0-9.]+]] = bitcast [[STRUCT]]* [[S]] to i8*
// CHECK-NEXT: [[COMPOUNDLITI8:%[a-zA-Z0-9.]+]] = bitcast [[STRUCT]]* [[COMPOUNDLIT]] to i8*
- // CHECK-NEXT: call void @llvm.memcpy{{.*}}(i8* [[SI8]], i8* [[COMPOUNDLITI8]]
+ // CHECK-NEXT: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} [[SI8]], i8* align {{[0-9]+}} [[COMPOUNDLITI8]]
s = (S){s.y,s.x};
// CHECK-NEXT: ret void
}
@@ -67,7 +67,7 @@ struct G g(int x, int y, int z) {
// CHECK-NEXT: [[T0:%.*]] = bitcast i48* [[COERCE_TEMP]] to i8*
// CHECK-NEXT: [[T1:%.*]] = bitcast [[G]]* [[RESULT]] to i8*
- // CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 6
+ // CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} [[T0]], i8* align {{[0-9]+}} [[T1]], i64 6
// CHECK-NEXT: [[T0:%.*]] = load i48, i48* [[COERCE_TEMP]]
// CHECK-NEXT: ret i48 [[T0]]
}
diff --git a/test/CodeGen/constructor-attribute.c b/test/CodeGen/constructor-attribute.c
index a1f0e604d401..562e57b8b122 100644
--- a/test/CodeGen/constructor-attribute.c
+++ b/test/CodeGen/constructor-attribute.c
@@ -1,8 +1,39 @@
-// RUN: %clang_cc1 -emit-llvm -o %t %s
-// RUN: grep -e "global_ctors.*@A" %t
-// RUN: grep -e "global_dtors.*@B" %t
-// RUN: grep -e "global_ctors.*@C" %t
-// RUN: grep -e "global_dtors.*@D" %t
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=WITHOUTATEXIT %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -fregister-global-dtors-with-atexit -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CXAATEXIT --check-prefix=WITHATEXIT %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -fno-use-cxa-atexit -fregister-global-dtors-with-atexit -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=ATEXIT --check-prefix=WITHATEXIT %s
+
+// WITHOUTATEXIT: global_ctors{{.*}}@A{{.*}}@C
+// WITHOUTATEXIT: @llvm.global_dtors = appending global [5 x { i32, void ()*, i8* }]{{.*}}@B{{.*}}@E{{.*}}@F{{.*}}@G{{.*}}@D
+// WITHATEXIT: @llvm.global_ctors = appending global [5 x { i32, void ()*, i8* }]{{.*}}i32 65535, void ()* @A,{{.*}}i32 65535, void ()* @C,{{.*}}i32 123, void ()* @__GLOBAL_init_123,{{.*}}i32 789, void ()* @[[GLOBAL_INIT_789:__GLOBAL_init_789.[0-9]+]],{{.*}}i32 65535, void ()* @__GLOBAL_init_65535,
+// WITHATEXIT-NOT: global_dtors
+
+// CHECK: define void @A()
+// CHECK: define void @B()
+// CHECK: define internal void @E()
+// CHECK: define internal void @F()
+// CHECK: define internal void @G()
+// CHECK: define i32 @__GLOBAL_init_789(i32 %{{.*}})
+// CHECK: define internal void @C()
+// CHECK: define internal void @D()
+// CHECK: define i32 @main()
+// CHECK: define internal i32 @foo()
+// WITHOUTATEXIT-NOT: define
+
+// WITHATEXIT: define internal void @__GLOBAL_init_123(){{.*}}section "__TEXT,__StaticInit,regular,pure_instructions"
+// CXAATEXIT: call i32 @__cxa_atexit(void (i8*)* bitcast (void ()* @E to void (i8*)*), i8* null, i8* @__dso_handle)
+// CXAATEXIT: call i32 @__cxa_atexit(void (i8*)* bitcast (void ()* @G to void (i8*)*), i8* null, i8* @__dso_handle)
+// ATEXIT: call i32 @atexit(void ()* @E)
+// ATEXIT: call i32 @atexit(void ()* @G)
+
+// WITHATEXIT: define internal void @[[GLOBAL_INIT_789]](){{.*}}section "__TEXT,__StaticInit,regular,pure_instructions"
+// CXAATEXIT: call i32 @__cxa_atexit(void (i8*)* bitcast (void ()* @F to void (i8*)*), i8* null, i8* @__dso_handle)
+// ATEXIT: call i32 @atexit(void ()* @F)
+
+// WITHATEXIT: define internal void @__GLOBAL_init_65535(){{.*}}section "__TEXT,__StaticInit,regular,pure_instructions"
+// CXAATEXIT: call i32 @__cxa_atexit(void (i8*)* bitcast (void ()* @B to void (i8*)*), i8* null, i8* @__dso_handle)
+// CXAATEXIT: call i32 @__cxa_atexit(void (i8*)* bitcast (void ()* @D to void (i8*)*), i8* null, i8* @__dso_handle)
+// ATEXIT: call i32 @atexit(void ()* @B)
+// ATEXIT: call i32 @atexit(void ()* @D)
int printf(const char *, ...);
@@ -21,6 +52,21 @@ static void C() __attribute__((constructor));
static void D() __attribute__((destructor));
+static __attribute__((destructor(123))) void E() {
+}
+
+static __attribute__((destructor(789))) void F() {
+}
+
+static __attribute__((destructor(123))) void G() {
+}
+
+// Test that this function doesn't collide with the synthesized constructor
+// function for destructors with priority 789.
+int __GLOBAL_init_789(int a) {
+ return a * a;
+}
+
static int foo() {
return 10;
}
diff --git a/test/CodeGen/debug-info-block-out-return.c b/test/CodeGen/debug-info-block-out-return.c
index 428a50c77a06..931ea64d1383 100644
--- a/test/CodeGen/debug-info-block-out-return.c
+++ b/test/CodeGen/debug-info-block-out-return.c
@@ -14,7 +14,7 @@
// CHECK: !DILocalVariable(name: ".block_descriptor", arg: 1,{{.*}}line: 2,
// CHECK: !DILocalVariable(name: "param", arg: 2,{{.*}}line: 2,
-// Line directive so we don't have to worry about how many lines preceed the
+// Line directive so we don't have to worry about how many lines precede the
// test code (as the line number is mangled in with the argument number as shown
// above)
#line 1
diff --git a/test/CodeGen/debug-info-cc.c b/test/CodeGen/debug-info-cc.c
new file mode 100644
index 000000000000..fe158550a627
--- /dev/null
+++ b/test/CodeGen/debug-info-cc.c
@@ -0,0 +1,120 @@
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s --check-prefix=LINUX
+// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s --check-prefix=WINDOWS
+// RUN: %clang_cc1 -triple i386-pc-linux-gnu -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s --check-prefix=LINUX32
+// RUN: %clang_cc1 -triple armv7--linux-gnueabihf -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s --check-prefix=ARM
+
+// enum CallingConv {
+// CC_C, // __attribute__((cdecl))
+// CC_X86StdCall, // __attribute__((stdcall))
+// CC_X86FastCall, // __attribute__((fastcall))
+// CC_X86ThisCall, // __attribute__((thiscall))
+// CC_X86VectorCall, // __attribute__((vectorcall))
+// CC_X86Pascal, // __attribute__((pascal))
+// CC_Win64, // __attribute__((ms_abi))
+// CC_X86_64SysV, // __attribute__((sysv_abi))
+// CC_X86RegCall, // __attribute__((regcall))
+// CC_AAPCS, // __attribute__((pcs("aapcs")))
+// CC_AAPCS_VFP, // __attribute__((pcs("aapcs-vfp")))
+// CC_IntelOclBicc, // __attribute__((intel_ocl_bicc))
+// CC_SpirFunction, // default for OpenCL functions on SPIR target
+// CC_OpenCLKernel, // inferred for OpenCL kernels
+// CC_Swift, // __attribute__((swiftcall))
+// CC_PreserveMost, // __attribute__((preserve_most))
+// CC_PreserveAll, // __attribute__((preserve_all))
+// };
+
+#ifdef __x86_64__
+
+#ifdef __linux__
+// LINUX: !DISubprogram({{.*}}"add_msabi", {{.*}}type: ![[FTY:[0-9]+]]
+// LINUX: ![[FTY]] = !DISubroutineType({{.*}}cc: DW_CC_LLVM_Win64,
+__attribute__((ms_abi)) int add_msabi(int a, int b) {
+ return a+b;
+}
+
+// LINUX: !DISubprogram({{.*}}"add_regcall", {{.*}}type: ![[FTY:[0-9]+]]
+// LINUX: ![[FTY]] = !DISubroutineType({{.*}}cc: DW_CC_LLVM_X86RegCall,
+__attribute__((regcall)) int add_regcall(int a, int b) {
+ return a+b;
+}
+
+// LINUX: !DISubprogram({{.*}}"add_preserve_most", {{.*}}type: ![[FTY:[0-9]+]]
+// LINUX: ![[FTY]] = !DISubroutineType({{.*}}cc: DW_CC_LLVM_PreserveMost,
+__attribute__((preserve_most)) int add_preserve_most(int a, int b) {
+ return a+b;
+}
+
+// LINUX: !DISubprogram({{.*}}"add_preserve_all", {{.*}}type: ![[FTY:[0-9]+]]
+// LINUX: ![[FTY]] = !DISubroutineType({{.*}}cc: DW_CC_LLVM_PreserveAll,
+__attribute__((preserve_all)) int add_preserve_all(int a, int b) {
+ return a+b;
+}
+
+// LINUX: !DISubprogram({{.*}}"add_swiftcall", {{.*}}type: ![[FTY:[0-9]+]]
+// LINUX: ![[FTY]] = !DISubroutineType({{.*}}cc: DW_CC_LLVM_Swift,
+__attribute__((swiftcall)) int add_swiftcall(int a, int b) {
+ return a+b;
+}
+
+// LINUX: !DISubprogram({{.*}}"add_inteloclbicc", {{.*}}type: ![[FTY:[0-9]+]]
+// LINUX: ![[FTY]] = !DISubroutineType({{.*}}cc: DW_CC_LLVM_IntelOclBicc,
+__attribute__((intel_ocl_bicc)) int add_inteloclbicc(int a, int b) {
+ return a+b;
+}
+#endif
+
+#ifdef _WIN64
+// WINDOWS: !DISubprogram({{.*}}"add_sysvabi", {{.*}}type: ![[FTY:[0-9]+]]
+// WINDOWS: ![[FTY]] = !DISubroutineType({{.*}}cc: DW_CC_LLVM_X86_64SysV,
+__attribute__((sysv_abi)) int add_sysvabi(int a, int b) {
+ return a+b;
+}
+#endif
+
+#endif
+
+#ifdef __i386__
+// LINUX32: !DISubprogram({{.*}}"add_stdcall", {{.*}}type: ![[FTY:[0-9]+]]
+// LINUX32: ![[FTY]] = !DISubroutineType({{.*}}cc: DW_CC_BORLAND_stdcall,
+__attribute__((stdcall)) int add_stdcall(int a, int b) {
+ return a+b;
+}
+
+// LINUX32: !DISubprogram({{.*}}"add_fastcall", {{.*}}type: ![[FTY:[0-9]+]]
+// LINUX32: ![[FTY]] = !DISubroutineType({{.*}}cc: DW_CC_BORLAND_msfastcall,
+__attribute__((fastcall)) int add_fastcall(int a, int b) {
+ return a+b;
+}
+
+// LINUX32: !DISubprogram({{.*}}"add_thiscall", {{.*}}type: ![[FTY:[0-9]+]]
+// LINUX32: ![[FTY]] = !DISubroutineType({{.*}}cc: DW_CC_BORLAND_thiscall,
+__attribute__((thiscall)) int add_thiscall(int a, int b) {
+ return a+b;
+}
+
+// LINUX32: !DISubprogram({{.*}}"add_vectorcall", {{.*}}type: ![[FTY:[0-9]+]]
+// LINUX32: ![[FTY]] = !DISubroutineType({{.*}}cc: DW_CC_LLVM_vectorcall,
+__attribute__((vectorcall)) int add_vectorcall(int a, int b) {
+ return a+b;
+}
+
+// LINUX32: !DISubprogram({{.*}}"add_pascal", {{.*}}type: ![[FTY:[0-9]+]]
+// LINUX32: ![[FTY]] = !DISubroutineType({{.*}}cc: DW_CC_BORLAND_pascal,
+__attribute__((pascal)) int add_pascal(int a, int b) {
+ return a+b;
+}
+#endif
+
+#ifdef __arm__
+// ARM: !DISubprogram({{.*}}"add_aapcs", {{.*}}type: ![[FTY:[0-9]+]]
+// ARM: ![[FTY]] = !DISubroutineType({{.*}}cc: DW_CC_LLVM_AAPCS,
+__attribute__((pcs("aapcs"))) int add_aapcs(int a, int b) {
+ return a+b;
+}
+
+// ARM: !DISubprogram({{.*}}"add_aapcs_vfp", {{.*}}type: ![[FTY:[0-9]+]]
+// ARM: ![[FTY]] = !DISubroutineType({{.*}}cc: DW_CC_LLVM_AAPCS_VFP,
+__attribute__((pcs("aapcs-vfp"))) int add_aapcs_vfp(int a, int b) {
+ return a+b;
+}
+#endif
diff --git a/test/CodeGen/debug-info-codeview-unnamed.c b/test/CodeGen/debug-info-codeview-unnamed.c
new file mode 100644
index 000000000000..bd2a7543e56b
--- /dev/null
+++ b/test/CodeGen/debug-info-codeview-unnamed.c
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -debug-info-kind=limited -S -emit-llvm -o - %s | FileCheck --check-prefix LINUX %s
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -debug-info-kind=limited -gcodeview -S -emit-llvm -o - %s | FileCheck --check-prefix MSVC %s
+
+int main(int argc, char* argv[], char* arge[]) {
+
+ // In both DWARF and CodeView, an unnamed C structure type will generate a
+ // DICompositeType without a name or identifier attribute.
+ //
+ struct { int bar; } one = {42};
+ //
+ // LINUX: !{{[0-9]+}} = !DILocalVariable(name: "one"
+ // LINUX-SAME: type: [[TYPE_OF_ONE:![0-9]+]]
+ // LINUX-SAME: )
+ // LINUX: [[TYPE_OF_ONE]] = distinct !DICompositeType(
+ // LINUX-SAME: tag: DW_TAG_structure_type
+ // LINUX-NOT: name:
+ // LINUX-NOT: identifier:
+ // LINUX-SAME: )
+ //
+ // MSVC: !{{[0-9]+}} = !DILocalVariable(name: "one"
+ // MSVC-SAME: type: [[TYPE_OF_ONE:![0-9]+]]
+ // MSVC-SAME: )
+ // MSVC: [[TYPE_OF_ONE]] = distinct !DICompositeType
+ // MSVC-SAME: tag: DW_TAG_structure_type
+ // MSVC-NOT: name:
+ // MSVC-NOT: identifier:
+ // MSVC-SAME: )
+
+ return 0;
+}
diff --git a/test/CodeGen/debug-info-embed-source.c b/test/CodeGen/debug-info-embed-source.c
new file mode 100644
index 000000000000..3b607b61f7d7
--- /dev/null
+++ b/test/CodeGen/debug-info-embed-source.c
@@ -0,0 +1,5 @@
+// RUN: %clang_cc1 -debug-info-kind=limited -emit-llvm %p/Inputs/debug-info-embed-source.c -o - | FileCheck %s --check-prefix=NOEMBED
+// RUN: %clang_cc1 -gembed-source -debug-info-kind=limited -emit-llvm %p/Inputs/debug-info-embed-source.c -o - | FileCheck %s --check-prefix=EMBED
+
+// NOEMBED-NOT: !DIFile({{.*}}source:
+// EMBED: !DIFile({{.*}}source: "void foo() { }\0A"
diff --git a/test/CodeGen/debug-info-enum.cpp b/test/CodeGen/debug-info-enum.cpp
new file mode 100644
index 000000000000..1237f28b9a55
--- /dev/null
+++ b/test/CodeGen/debug-info-enum.cpp
@@ -0,0 +1,100 @@
+// Test enumeration representation in debug info metadata:
+// * test value representation for each possible underlying integer type
+// * test that the underlying integer type is as expected
+// * test that the DW_AT_enum_class attribute is present (resp. absent) as expected.
+
+// RUN: %clang -target x86_64-linux -g -S -emit-llvm -o - %s | FileCheck %s
+
+
+enum class E0 : signed char {
+ A0 = -128,
+ B0 = 127,
+} x0;
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, name: "E0"
+// CHECK-SAME: baseType: ![[SCHAR:[0-9]+]]
+// CHECK-SAME: DIFlagFixedEnum
+// CHECK-SAME: elements: ![[ELTS0:[0-9]+]]
+// CHECK: ![[SCHAR]] = !DIBasicType(name: "signed char", size: 8, encoding: DW_ATE_signed_char)
+// CHECK: ![[ELTS0]] = !{![[A0:[0-9]+]], ![[B0:[0-9]+]]}
+// CHECK: ![[A0]] = !DIEnumerator(name: "A0", value: -128)
+// CHECK: ![[B0]] = !DIEnumerator(name: "B0", value: 127)
+
+enum class E1 : unsigned char { A1 = 255 } x1;
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, name: "E1"
+// CHECK-SAME: baseType: ![[UCHAR:[0-9]+]]
+// CHECK-SAME: DIFlagFixedEnum
+// CHECK-SAME: elements: ![[ELTS1:[0-9]+]]
+// CHECK: ![[UCHAR]] = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char)
+// CHECK: ![[ELTS1]] = !{![[A1:[0-9]+]]}
+// CHECK: ![[A1]] = !DIEnumerator(name: "A1", value: 255, isUnsigned: true)
+
+enum class E2 : signed short {
+ A2 = -32768,
+ B2 = 32767,
+} x2;
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, name: "E2"
+// CHECK-SAME: baseType: ![[SHORT:[0-9]+]]
+// CHECK-SAME: DIFlagFixedEnum
+// CHECK-SAME: elements: ![[ELTS2:[0-9]+]]
+// CHECK: ![[SHORT]] = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed)
+// CHECK: ![[ELTS2]] = !{![[A2:[0-9]+]], ![[B2:[0-9]+]]}
+// CHECK: ![[A2]] = !DIEnumerator(name: "A2", value: -32768)
+// CHECK: ![[B2]] = !DIEnumerator(name: "B2", value: 32767)
+
+enum class E3 : unsigned short { A3 = 65535 } x3;
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, name: "E3"
+// CHECK-SAME: baseType: ![[USHORT:[0-9]+]]
+// CHECK-SAME: DIFlagFixedEnum
+// CHECK-SAME: elements: ![[ELTS3:[0-9]+]]
+// CHECK: ![[USHORT]] = !DIBasicType(name: "unsigned short", size: 16, encoding: DW_ATE_unsigned)
+// CHECK: ![[ELTS3]] = !{![[A3:[0-9]+]]}
+// CHECK: ![[A3]] = !DIEnumerator(name: "A3", value: 65535, isUnsigned: true)
+
+enum class E4 : signed int { A4 = -2147483648, B4 = 2147483647 } x4;
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, name: "E4"
+// CHECK-SAME: baseType: ![[INT:[0-9]+]]
+// CHECK-SAME: DIFlagFixedEnum
+// CHECK-SAME: elements: ![[ELTS4:[0-9]+]]
+// CHECK: ![[INT]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+// CHECK: ![[ELTS4]] = !{![[A4:[0-9]+]], ![[B4:[0-9]+]]}
+// CHECK: ![[A4]] = !DIEnumerator(name: "A4", value: -2147483648)
+// CHECK: ![[B4]] = !DIEnumerator(name: "B4", value: 2147483647)
+
+enum class E5 : unsigned int { A5 = 4294967295 } x5;
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, name: "E5"
+// CHECK-SAME: baseType: ![[UINT:[0-9]+]]
+// CHECK-SAME: DIFlagFixedEnum
+// CHECK-SAME: elements: ![[ELTS5:[0-9]+]]
+// CHECK: ![[UINT]] = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+// CHECK: ![[ELTS5]] = !{![[A5:[0-9]+]]}
+// CHECK: ![[A5]] = !DIEnumerator(name: "A5", value: 4294967295, isUnsigned: true)
+
+enum class E6 : signed long long {
+ A6 = -9223372036854775807LL - 1,
+ B6 = 9223372036854775807LL
+} x6;
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, name: "E6"
+// CHECK-SAME: baseType: ![[LONG:[0-9]+]]
+// CHECK-SAME: DIFlagFixedEnum
+// CHECK-SAME: elements: ![[ELTS6:[0-9]+]]
+// CHECK: ![[LONG]] = !DIBasicType(name: "long long int", size: 64, encoding: DW_ATE_signed)
+// CHECK: ![[ELTS6]] = !{![[A6:[0-9]+]], ![[B6:[0-9]+]]}
+// CHECK: ![[A6]] = !DIEnumerator(name: "A6", value: -9223372036854775808)
+// CHECK: ![[B6]] = !DIEnumerator(name: "B6", value: 9223372036854775807)
+
+enum class E7 : unsigned long long { A7 = 18446744073709551615ULL } x7;
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, name: "E7"
+// CHECK-SAME: baseType: ![[ULONG:[0-9]+]]
+// CHECK-SAME: DIFlagFixedEnum
+// CHECK-SAME: elements: ![[ELTS7:[0-9]+]]
+// CHECK: ![[ULONG]] = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+// CHECK: ![[ELTS7]] = !{![[A7:[0-9]+]]}
+// CHECK: ![[A7]] = !DIEnumerator(name: "A7", value: 18446744073709551615, isUnsigned: true)
+
+// Also test that the FixedEnum flag is not present for old-style enumerations.
+enum E8 { A8 = -128, B8 = 127 } x8;
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, name: "E8"
+// CHECK-SAME: baseType: ![[INT]]
+// CHECK-NOT: DIFlagFixedEnum
+// CHECK: !DIEnumerator(name: "A8", value: -128)
+
diff --git a/test/CodeGen/debug-info-file-checksum.c b/test/CodeGen/debug-info-file-checksum.c
index 2750800d41ea..d644aac0614f 100644
--- a/test/CodeGen/debug-info-file-checksum.c
+++ b/test/CodeGen/debug-info-file-checksum.c
@@ -1,4 +1,5 @@
// RUN: %clang -emit-llvm -S -g -gcodeview -x c %S/Inputs/debug-info-file-checksum.c -o - | FileCheck %s
+// RUN: %clang -emit-llvm -S -gdwarf-5 -x c %S/Inputs/debug-info-file-checksum.c -o - | FileCheck %s
// Check that "checksum" is created correctly for the compiled file.
diff --git a/test/CodeGen/debug-info-inline-for.c b/test/CodeGen/debug-info-inline-for.c
new file mode 100644
index 000000000000..55066b28a1ff
--- /dev/null
+++ b/test/CodeGen/debug-info-inline-for.c
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s
+// Check that clang emits a debug location on the phi instruction.
+
+int func(int n) {
+ int a;
+ for(a = 10; a>0 && n++; a--);
+ return n;
+}
+
+// CHECK: land.end:
+// CHECK-NEXT: {{.*}} = phi i1 {{.*}} !dbg ![[DbgLoc:[0-9]+]]
+
+// CHECK: ![[DbgLoc]] = !DILocation(line: 0
diff --git a/test/CodeGen/debug-info-vla.c b/test/CodeGen/debug-info-vla.c
index 7928ca761397..8bd6a6d0acbe 100644
--- a/test/CodeGen/debug-info-vla.c
+++ b/test/CodeGen/debug-info-vla.c
@@ -2,9 +2,11 @@
void testVLAwithSize(int s)
{
-// CHECK: dbg.declare
-// CHECK: dbg.declare({{.*}}, metadata ![[VAR:.*]], metadata !DIExpression())
-// CHECK: ![[VAR]] = !DILocalVariable(name: "vla",{{.*}} line: [[@LINE+1]]
+// CHECK-DAG: dbg.declare({{.*}} %__vla_expr, metadata ![[VLAEXPR:[0-9]+]]
+// CHECK-DAG: dbg.declare({{.*}} %vla, metadata ![[VAR:[0-9]+]]
+// CHECK-DAG: ![[VLAEXPR]] = !DILocalVariable(name: "__vla_expr", {{.*}} flags: DIFlagArtificial
+// CHECK-DAG: ![[VAR]] = !DILocalVariable(name: "vla",{{.*}} line: [[@LINE+2]]
+// CHECK-DAG: !DISubrange(count: ![[VLAEXPR]])
int vla[s];
int i;
for (i = 0; i < s; i++) {
diff --git a/test/CodeGen/decl.c b/test/CodeGen/decl.c
index 2065e3364bcd..d62629c7fb5c 100644
--- a/test/CodeGen/decl.c
+++ b/test/CodeGen/decl.c
@@ -1,13 +1,13 @@
-// RUN: %clang_cc1 -w -emit-llvm < %s | FileCheck %s
+// RUN: %clang_cc1 -w -fmerge-all-constants -emit-llvm < %s | FileCheck %s
// CHECK: @test1.x = internal constant [12 x i32] [i32 1
// CHECK: @test2.x = private unnamed_addr constant [13 x i32] [i32 1,
-// CHECK: @test5w = global { i32, [4 x i8] } { i32 2, [4 x i8] undef }
-// CHECK: @test5y = global { double } { double 7.300000e+0{{[0]*}}1 }
+// CHECK: @test5w = {{(dso_local )?}}global { i32, [4 x i8] } { i32 2, [4 x i8] undef }
+// CHECK: @test5y = {{(dso_local )?}}global { double } { double 7.300000e+0{{[0]*}}1 }
// CHECK: @test6.x = private unnamed_addr constant %struct.SelectDest { i8 1, i8 2, i32 3, i32 0 }
-// CHECK: @test7 = global [2 x %struct.test7s] [%struct.test7s { i32 1, i32 2 }, %struct.test7s { i32 4, i32 0 }]
+// CHECK: @test7 = {{(dso_local )?}}global [2 x %struct.test7s] [%struct.test7s { i32 1, i32 2 }, %struct.test7s { i32 4, i32 0 }]
void test1() {
// This should codegen as a "@test1.x" global.
diff --git a/test/CodeGen/default-address-space.c b/test/CodeGen/default-address-space.c
index b7f40585b267..5450609e06bb 100644
--- a/test/CodeGen/default-address-space.c
+++ b/test/CodeGen/default-address-space.c
@@ -1,39 +1,27 @@
-// RUN: %clang_cc1 -triple amdgcn -emit-llvm < %s | FileCheck -check-prefixes=PIZ,COM %s
// RUN: %clang_cc1 -triple amdgcn---amdgiz -emit-llvm < %s | FileCheck -check-prefixes=CHECK,COM %s
-// PIZ-DAG: @foo = common addrspace(1) global i32 0
// CHECK-DAG: @foo = common addrspace(1) global i32 0
int foo;
-// PIZ-DAG: @ban = common addrspace(1) global [10 x i32] zeroinitializer
// CHECK-DAG: @ban = common addrspace(1) global [10 x i32] zeroinitializer
int ban[10];
-// PIZ-DAG: @A = common addrspace(1) global i32 addrspace(4)* null
-// PIZ-DAG: @B = common addrspace(1) global i32 addrspace(4)* null
// CHECK-DAG: @A = common addrspace(1) global i32* null
// CHECK-DAG: @B = common addrspace(1) global i32* null
int *A;
int *B;
// COM-LABEL: define i32 @test1()
-// PIZ: load i32, i32 addrspace(4)* addrspacecast{{[^@]+}} @foo
// CHECK: load i32, i32* addrspacecast{{[^@]+}} @foo
int test1() { return foo; }
// COM-LABEL: define i32 @test2(i32 %i)
// COM: %[[addr:.*]] = getelementptr
-// PIZ: load i32, i32 addrspace(4)* %[[addr]]
-// PIZ-NEXT: ret i32
// CHECK: load i32, i32* %[[addr]]
// CHECK-NEXT: ret i32
int test2(int i) { return ban[i]; }
// COM-LABEL: define void @test3()
-// PIZ: load i32 addrspace(4)*, i32 addrspace(4)* addrspace(4)* addrspacecast{{[^@]+}} @B
-// PIZ: load i32, i32 addrspace(4)*
-// PIZ: load i32 addrspace(4)*, i32 addrspace(4)* addrspace(4)* addrspacecast{{[^@]+}} @A
-// PIZ: store i32 {{.*}}, i32 addrspace(4)*
// CHECK: load i32*, i32** addrspacecast{{.*}} @B
// CHECK: load i32, i32*
// CHECK: load i32*, i32** addrspacecast{{.*}} @A
@@ -42,13 +30,6 @@ void test3() {
*A = *B;
}
-// PIZ-LABEL: define void @test4(i32 addrspace(4)* %a)
-// PIZ: %[[alloca:.*]] = alloca i32 addrspace(4)*
-// PIZ: %[[a_addr:.*]] = addrspacecast{{.*}} %[[alloca]] to i32 addrspace(4)* addrspace(4)*
-// PIZ: store i32 addrspace(4)* %a, i32 addrspace(4)* addrspace(4)* %[[a_addr]]
-// PIZ: %[[r0:.*]] = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(4)* %[[a_addr]]
-// PIZ: %[[arrayidx:.*]] = getelementptr inbounds i32, i32 addrspace(4)* %[[r0]]
-// PIZ: store i32 0, i32 addrspace(4)* %[[arrayidx]]
// CHECK-LABEL: define void @test4(i32* %a)
// CHECK: %[[alloca:.*]] = alloca i32*, align 8, addrspace(5)
// CHECK: %[[a_addr:.*]] = addrspacecast{{.*}} %[[alloca]] to i32**
diff --git a/test/CodeGen/delete-null-pointer-checks.c b/test/CodeGen/delete-null-pointer-checks.c
new file mode 100644
index 000000000000..a3c1460f0a06
--- /dev/null
+++ b/test/CodeGen/delete-null-pointer-checks.c
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -emit-llvm -triple x86_64-unknown-linux-gnu -O2 -o - %s | FileCheck -check-prefix=NULL-POINTER-INVALID %s
+// RUN: %clang_cc1 -emit-llvm -triple x86_64-unknown-linux-gnu -O2 -o - %s -fno-delete-null-pointer-checks | FileCheck -check-prefix=NULL-POINTER-VALID %s
+
+// Test that clang does not remove the null pointer check with
+// -fno-delete-null-pointer-checks.
+int null_check(int *P) {
+// NULL-POINTER-VALID: %[[TOBOOL:.*]] = icmp eq i32* %P, null
+// NULL-POINTER-INVALID-NOT: icmp eq
+// NULL-POINTER-VALID: %[[SEL:.*]] = select i1 %[[TOBOOL:.*]], i32* null, i32*
+// NULL-POINTER-INVALID-NOT: select i1
+// NULL-POINTER-VALID: load i32, i32* %[[SEL:.*]]
+ int *Q = P;
+ if (P) {
+ Q = P + 2;
+ }
+ return *Q;
+}
+
+// NULL-POINTER-INVALID-NOT: attributes #0 = {{.*}} "null-pointer-is-valid"="true"
+// NULL-POINTER-VALID: attributes #0 = {{.*}} "null-pointer-is-valid"="true"
diff --git a/test/CodeGen/dllexport.c b/test/CodeGen/dllexport.c
index 17c2ce9e2755..f27012ec2355 100644
--- a/test/CodeGen/dllexport.c
+++ b/test/CodeGen/dllexport.c
@@ -14,24 +14,24 @@
__declspec(dllexport) extern int ExternGlobalDecl;
// dllexport implies a definition.
-// CHECK-DAG: @GlobalDef = common dllexport global i32 0, align 4
+// CHECK-DAG: @GlobalDef = common dso_local dllexport global i32 0, align 4
__declspec(dllexport) int GlobalDef;
// Export definition.
-// CHECK-DAG: @GlobalInit = dllexport global i32 1, align 4
+// CHECK-DAG: @GlobalInit = dso_local dllexport global i32 1, align 4
__declspec(dllexport) int GlobalInit = 1;
// Declare, then export definition.
-// CHECK-DAG: @GlobalDeclInit = dllexport global i32 1, align 4
+// CHECK-DAG: @GlobalDeclInit = dso_local dllexport global i32 1, align 4
__declspec(dllexport) extern int GlobalDeclInit;
int GlobalDeclInit = 1;
// Redeclarations
-// CHECK-DAG: @GlobalRedecl1 = common dllexport global i32 0, align 4
+// CHECK-DAG: @GlobalRedecl1 = common dso_local dllexport global i32 0, align 4
__declspec(dllexport) extern int GlobalRedecl1;
__declspec(dllexport) int GlobalRedecl1;
-// CHECK-DAG: @GlobalRedecl2 = common dllexport global i32 0, align 4
+// CHECK-DAG: @GlobalRedecl2 = common dso_local dllexport global i32 0, align 4
__declspec(dllexport) extern int GlobalRedecl2;
int GlobalRedecl2;
@@ -44,22 +44,22 @@ __declspec(dllexport) extern int GlobalRedecl2;
// Declarations are not exported.
// Export function definition.
-// CHECK-DAG: define dllexport void @def()
+// CHECK-DAG: define dso_local dllexport void @def()
__declspec(dllexport) void def(void) {}
// Export inline function.
-// CHECK-DAG: define weak_odr dllexport void @inlineFunc()
-// CHECK-DAG: define weak_odr dllexport void @externInlineFunc()
+// CHECK-DAG: define weak_odr dso_local dllexport void @inlineFunc()
+// CHECK-DAG: define weak_odr dso_local dllexport void @externInlineFunc()
__declspec(dllexport) inline void inlineFunc(void) {}
__declspec(dllexport) inline void externInlineFunc(void) {}
extern void externInlineFunc(void);
// Redeclarations
-// CHECK-DAG: define dllexport void @redecl1()
+// CHECK-DAG: define dso_local dllexport void @redecl1()
__declspec(dllexport) void redecl1(void);
__declspec(dllexport) void redecl1(void) {}
-// CHECK-DAG: define dllexport void @redecl2()
+// CHECK-DAG: define dso_local dllexport void @redecl2()
__declspec(dllexport) void redecl2(void);
void redecl2(void) {}
@@ -70,46 +70,46 @@ __declspec(dllexport) void redecl2(void);
//===----------------------------------------------------------------------===//
// dllexport takes precedence over the dllimport if both are specified.
-// CHECK-DAG: @PrecedenceGlobal1A = common dllexport global i32 0, align 4
-// CHECK-DAG: @PrecedenceGlobal1B = common dllexport global i32 0, align 4
+// CHECK-DAG: @PrecedenceGlobal1A = common dso_local dllexport global i32 0, align 4
+// CHECK-DAG: @PrecedenceGlobal1B = common dso_local dllexport global i32 0, align 4
__attribute__((dllimport, dllexport)) int PrecedenceGlobal1A;
__declspec(dllimport) __declspec(dllexport) int PrecedenceGlobal1B;
-// CHECK-DAG: @PrecedenceGlobal2A = common dllexport global i32 0, align 4
-// CHECK-DAG: @PrecedenceGlobal2B = common dllexport global i32 0, align 4
+// CHECK-DAG: @PrecedenceGlobal2A = common dso_local dllexport global i32 0, align 4
+// CHECK-DAG: @PrecedenceGlobal2B = common dso_local dllexport global i32 0, align 4
__attribute__((dllexport, dllimport)) int PrecedenceGlobal2A;
__declspec(dllexport) __declspec(dllimport) int PrecedenceGlobal2B;
-// CHECK-DAG: @PrecedenceGlobalRedecl1 = dllexport global i32 0, align 4
+// CHECK-DAG: @PrecedenceGlobalRedecl1 = dso_local dllexport global i32 0, align 4
__declspec(dllexport) extern int PrecedenceGlobalRedecl1;
__declspec(dllimport) int PrecedenceGlobalRedecl1 = 0;
-// CHECK-DAG: @PrecedenceGlobalRedecl2 = common dllexport global i32 0, align 4
+// CHECK-DAG: @PrecedenceGlobalRedecl2 = common dso_local dllexport global i32 0, align 4
__declspec(dllimport) extern int PrecedenceGlobalRedecl2;
__declspec(dllexport) int PrecedenceGlobalRedecl2;
-// CHECK-DAG: @PrecedenceGlobalMixed1 = dllexport global i32 1, align 4
+// CHECK-DAG: @PrecedenceGlobalMixed1 = dso_local dllexport global i32 1, align 4
__attribute__((dllexport)) extern int PrecedenceGlobalMixed1;
__declspec(dllimport) int PrecedenceGlobalMixed1 = 1;
-// CHECK-DAG: @PrecedenceGlobalMixed2 = common dllexport global i32 0, align 4
+// CHECK-DAG: @PrecedenceGlobalMixed2 = common dso_local dllexport global i32 0, align 4
__attribute__((dllimport)) extern int PrecedenceGlobalMixed2;
__declspec(dllexport) int PrecedenceGlobalMixed2;
-// CHECK-DAG: define dllexport void @precedence1A()
-// CHECK-DAG: define dllexport void @precedence1B()
+// CHECK-DAG: define dso_local dllexport void @precedence1A()
+// CHECK-DAG: define dso_local dllexport void @precedence1B()
void __attribute__((dllimport, dllexport)) precedence1A(void) {}
void __declspec(dllimport) __declspec(dllexport) precedence1B(void) {}
-// CHECK-DAG: define dllexport void @precedence2A()
-// CHECK-DAG: define dllexport void @precedence2B()
+// CHECK-DAG: define dso_local dllexport void @precedence2A()
+// CHECK-DAG: define dso_local dllexport void @precedence2B()
void __attribute__((dllexport, dllimport)) precedence2A(void) {}
void __declspec(dllexport) __declspec(dllimport) precedence2B(void) {}
-// CHECK-DAG: define dllexport void @precedenceRedecl1()
+// CHECK-DAG: define dso_local dllexport void @precedenceRedecl1()
void __declspec(dllimport) precedenceRedecl1(void);
void __declspec(dllexport) precedenceRedecl1(void) {}
-// CHECK-DAG: define dllexport void @precedenceRedecl2()
+// CHECK-DAG: define dso_local dllexport void @precedenceRedecl2()
void __declspec(dllexport) precedenceRedecl2(void);
void __declspec(dllimport) precedenceRedecl2(void) {}
diff --git a/test/CodeGen/dllimport.c b/test/CodeGen/dllimport.c
index f70048ebd1fb..61d695705261 100644
--- a/test/CodeGen/dllimport.c
+++ b/test/CodeGen/dllimport.c
@@ -39,14 +39,14 @@ USEVAR(GlobalRedecl2)
// NB: MSVC issues a warning and makes GlobalRedecl3 dllexport. We follow GCC
// and drop the dllimport with a warning.
-// CHECK: @GlobalRedecl3 = external global i32
+// CHECK: @GlobalRedecl3 = external dso_local global i32
__declspec(dllimport) extern int GlobalRedecl3;
extern int GlobalRedecl3; // dllimport ignored
USEVAR(GlobalRedecl3)
// Make sure this works even if the decl has been used before it's defined (PR20792).
-// MS: @GlobalRedecl4 = common dllexport global i32
-// GNU: @GlobalRedecl4 = common global i32
+// MS: @GlobalRedecl4 = common dso_local dllexport global i32
+// GNU: @GlobalRedecl4 = common dso_local global i32
__declspec(dllimport) extern int GlobalRedecl4;
USEVAR(GlobalRedecl4)
int GlobalRedecl4; // dllimport ignored
@@ -76,22 +76,22 @@ int functionScope() {
__declspec(dllimport) void decl(void);
// Initialize use_decl with the address of the thunk.
-// CHECK-DAG: @use_decl = global void ()* @decl
+// CHECK-DAG: @use_decl = dso_local global void ()* @decl
void (*use_decl)(void) = &decl;
// Import inline function.
// MS-DAG: declare dllimport void @inlineFunc()
// MO1-DAG: define available_externally dllimport void @inlineFunc()
-// GNU-DAG: declare void @inlineFunc()
-// GO1-DAG: define available_externally void @inlineFunc()
+// GNU-DAG: declare dso_local void @inlineFunc()
+// GO1-DAG: define available_externally dso_local void @inlineFunc()
__declspec(dllimport) inline void inlineFunc(void) {}
USE(inlineFunc)
// inline attributes
// MS-DAG: declare dllimport void @noinline()
// MO1-DAG: define available_externally dllimport void @noinline()
-// GNU-DAG: declare void @noinline()
-// GO1-DAG: define available_externally void @noinline()
+// GNU-DAG: declare dso_local void @noinline()
+// GO1-DAG: define available_externally dso_local void @noinline()
// CHECK-NOT: @alwaysInline()
// O1-NOT: @alwaysInline()
__declspec(dllimport) __attribute__((noinline)) inline void noinline(void) {}
@@ -107,20 +107,20 @@ USE(redecl1)
// NB: MSVC issues a warning and makes redecl2/redecl3 dllexport. We follow GCC
// and drop the dllimport with a warning.
-// CHECK-DAG: declare void @redecl2()
+// CHECK-DAG: declare dso_local void @redecl2()
__declspec(dllimport) void redecl2(void);
void redecl2(void);
USE(redecl2)
-// MS: define dllexport void @redecl3()
-// GNU: define void @redecl3()
+// MS: define dso_local dllexport void @redecl3()
+// GNU: define dso_local void @redecl3()
__declspec(dllimport) void redecl3(void);
void redecl3(void) {} // dllimport ignored
USE(redecl3)
// Make sure this works even if the decl is used before it's defined (PR20792).
-// MS: define dllexport void @redecl4()
-// GNU: define void @redecl4()
+// MS: define dso_local dllexport void @redecl4()
+// GNU: define dso_local void @redecl4()
__declspec(dllimport) void redecl4(void);
USE(redecl4)
void redecl4(void) {} // dllimport ignored
diff --git a/test/CodeGen/dso-local-executable.c b/test/CodeGen/dso-local-executable.c
new file mode 100644
index 000000000000..6e434a67effb
--- /dev/null
+++ b/test/CodeGen/dso-local-executable.c
@@ -0,0 +1,112 @@
+// RUN: %clang_cc1 -triple x86_64-pc-win32 -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix=COFF %s
+// COFF-DAG: @bar = external dso_local global i32
+// COFF-DAG: @weak_bar = extern_weak dso_local global i32
+// COFF-DAG: declare dso_local void @foo()
+// COFF-DAG: @baz = dso_local global i32 42
+// COFF-DAG: define dso_local i32* @zed()
+// COFF-DAG: @thread_var = external dso_local thread_local global i32
+// COFF-DAG: @local_thread_var = dso_local thread_local global i32 42
+// COFF-DAG: @import_var = external dllimport global i32
+// COFF-DAG: declare dllimport void @import_func()
+
+// RUN: %clang_cc1 -triple x86_64-pc-linux -emit-llvm -mrelocation-model static %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix=STATIC %s
+// STATIC-DAG: @bar = external dso_local global i32
+// STATIC-DAG: @weak_bar = extern_weak dso_local global i32
+// STATIC-DAG: declare dso_local void @foo()
+// STATIC-DAG: @baz = dso_local global i32 42
+// STATIC-DAG: define dso_local i32* @zed()
+// STATIC-DAG: @thread_var = external thread_local global i32
+// STATIC-DAG: @local_thread_var = dso_local thread_local global i32 42
+// STATIC-DAG: @import_var = external dso_local global i32
+// STATIC-DAG: declare dso_local void @import_func()
+
+// RUN: %clang_cc1 -triple x86_64-pc-linux -emit-llvm -pic-is-pie -mpie-copy-relocations %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix=PIE-COPY %s
+// PIE-COPY-DAG: @bar = external dso_local global i32
+// PIE-COPY-DAG: @weak_bar = extern_weak global i32
+// PIE-COPY-DAG: declare void @foo()
+// PIE-COPY-DAG: @baz = dso_local global i32 42
+// PIE-COPY-DAG: define dso_local i32* @zed()
+// PIE-COPY-DAG: @thread_var = external thread_local global i32
+// PIE-COPY-DAG: @local_thread_var = dso_local thread_local global i32 42
+// PIE-COPY-DAG: @import_var = external dso_local global i32
+// PIE-COPY-DAG: declare void @import_func()
+
+// RUN: %clang_cc1 -triple x86_64-pc-linux -emit-llvm -pic-is-pie %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix=PIE %s
+// PIE-DAG: @bar = external global i32
+// PIE-DAG: @weak_bar = extern_weak global i32
+// PIE-DAG: declare void @foo()
+// PIE-DAG: @baz = dso_local global i32 42
+// PIE-DAG: define dso_local i32* @zed()
+// PIE-DAG: @thread_var = external thread_local global i32
+// PIE-DAG: @local_thread_var = dso_local thread_local global i32 42
+// PIE-DAG: @import_var = external global i32
+// PIE-DAG: declare void @import_func()
+
+// RUN: %clang_cc1 -triple x86_64-pc-linux -emit-llvm -mrelocation-model static -fno-plt %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix=NOPLT %s
+// NOPLT-DAG: @bar = external dso_local global i32
+// NOPLT-DAG: @weak_bar = extern_weak dso_local global i32
+// NOPLT-DAG: declare void @foo()
+// NOPLT-DAG: @baz = dso_local global i32 42
+// NOPLT-DAG: define dso_local i32* @zed()
+// NOPLT-DAG: @thread_var = external thread_local global i32
+// NOPLT-DAG: @local_thread_var = dso_local thread_local global i32 42
+// NOPLT-DAG: @import_var = external dso_local global i32
+// NOPLT-DAG: declare void @import_func()
+
+// RUN: %clang_cc1 -triple x86_64-pc-linux -emit-llvm -fno-plt -pic-is-pie -mpie-copy-relocations %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix=PIE-COPY-NOPLT %s
+// PIE-COPY-NOPLT-DAG: @bar = external dso_local global i32
+// PIE-COPY-NOPLT-DAG: @weak_bar = extern_weak global i32
+// PIE-COPY-NOPLT-DAG: declare void @foo()
+// PIE-COPY-NOPLT-DAG: @baz = dso_local global i32 42
+// PIE-COPY-NOPLT-DAG: define dso_local i32* @zed()
+// PIE-COPY-NOPLT-DAG: @thread_var = external thread_local global i32
+// PIE-COPY-NOPLT-DAG: @local_thread_var = dso_local thread_local global i32 42
+// PIE-COPY-NOPLT-DAG: @import_var = external dso_local global i32
+// PIE-COPY-NOPLT-DAG: declare void @import_func()
+
+// RUN: %clang_cc1 -triple x86_64-pc-linux -emit-llvm -pic-is-pie -fno-plt %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix=PIE-NO-PLT %s
+// RUN: %clang_cc1 -triple powerpc64le-pc-linux -emit-llvm -mrelocation-model static %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix=PIE-NO-PLT %s
+// PIE-NO-PLT-DAG: @bar = external global i32
+// PIE-NO-PLT-DAG: @weak_bar = extern_weak global i32
+// PIE-NO-PLT-DAG: declare void @foo()
+// PIE-NO-PLT-DAG: @baz = dso_local global i32 42
+// PIE-NO-PLT-DAG: define dso_local i32* @zed()
+// PIE-NO-PLT-DAG: @thread_var = external thread_local global i32
+// PIE-NO-PLT-DAG: @local_thread_var = dso_local thread_local global i32 42
+// PIE-NO-PLT-DAG: @import_var = external global i32
+// PIE-NO-PLT-DAG: declare void @import_func()
+
+// RUN: %clang_cc1 -triple x86_64-pc-linux -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix=SHARED %s
+// SHARED-DAG: @bar = external global i32
+// SHARED-DAG: @weak_bar = extern_weak global i32
+// SHARED-DAG: declare void @foo()
+// SHARED-DAG: @baz = global i32 42
+// SHARED-DAG: define i32* @zed()
+// SHARED-DAG: @thread_var = external thread_local global i32
+// SHARED-DAG: @local_thread_var = thread_local global i32 42
+// SHARED-DAG: @import_var = external global i32
+// SHARED-DAG: declare void @import_func()
+
+__attribute__((dllimport)) extern int import_var;
+__attribute__((dllimport)) void import_func(void);
+
+int *use_import() {
+ import_func();
+ return &import_var;
+}
+
+extern int bar;
+__attribute__((weak)) extern int weak_bar;
+void foo(void);
+
+int baz = 42;
+int *zed() {
+ foo();
+ return baz ? &weak_bar : &bar;
+}
+
+extern __thread int thread_var;
+__thread int local_thread_var = 42;
+int *get_thread_var(int a) {
+ return a ? &thread_var : &local_thread_var;
+}
diff --git a/test/CodeGen/dump-struct-builtin.c b/test/CodeGen/dump-struct-builtin.c
new file mode 100644
index 000000000000..cd8540960205
--- /dev/null
+++ b/test/CodeGen/dump-struct-builtin.c
@@ -0,0 +1,555 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+
+#include "Inputs/stdio.h"
+#include <stdint.h>
+
+// CHECK: @unit1.a = private unnamed_addr constant %struct.U1A { i16 12 }, align 2
+// CHECK-NEXT: [[STRUCT_STR_U1:@[0-9]+]] = private unnamed_addr constant [14 x i8] c"struct U1A {\0A\00"
+// CHECK-NEXT: [[FIELD_U1:@[0-9]+]] = private unnamed_addr constant [11 x i8] c"short a : \00"
+// CHECK-NEXT: [[FORMAT_U1:@[0-9]+]] = private unnamed_addr constant [5 x i8] c"%hd\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U1:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @unit2.a = private unnamed_addr constant %struct.U2A { i16 12 }, align 2
+// CHECK-NEXT: [[STRUCT_STR_U2:@[0-9]+]] = private unnamed_addr constant [14 x i8] c"struct U2A {\0A\00"
+// CHECK-NEXT: [[FIELD_U2:@[0-9]+]] = private unnamed_addr constant [20 x i8] c"unsigned short a : \00"
+// CHECK-NEXT: [[FORMAT_U2:@[0-9]+]] = private unnamed_addr constant [5 x i8] c"%hu\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U2:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @unit3.a = private unnamed_addr constant %struct.U3A { i32 12 }, align 4
+// CHECK-NEXT: [[STRUCT_STR_U3:@[0-9]+]] = private unnamed_addr constant [14 x i8] c"struct U3A {\0A\00"
+// CHECK-NEXT: [[FIELD_U3:@[0-9]+]] = private unnamed_addr constant [9 x i8] c"int a : \00"
+// CHECK-NEXT: [[FORMAT_U3:@[0-9]+]] = private unnamed_addr constant [4 x i8] c"%d\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U3:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @unit4.a = private unnamed_addr constant %struct.U4A { i32 12 }, align 4
+// CHECK-NEXT: [[STRUCT_STR_U4:@[0-9]+]] = private unnamed_addr constant [14 x i8] c"struct U4A {\0A\00"
+// CHECK-NEXT: [[FIELD_U4:@[0-9]+]] = private unnamed_addr constant [18 x i8] c"unsigned int a : \00"
+// CHECK-NEXT: [[FORMAT_U4:@[0-9]+]] = private unnamed_addr constant [4 x i8] c"%u\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U4:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @unit5.a = private unnamed_addr constant %struct.U5A { i64 12 }, align 8
+// CHECK-NEXT: [[STRUCT_STR_U5:@[0-9]+]] = private unnamed_addr constant [14 x i8] c"struct U5A {\0A\00"
+// CHECK-NEXT: [[FIELD_U5:@[0-9]+]] = private unnamed_addr constant [10 x i8] c"long a : \00"
+// CHECK-NEXT: [[FORMAT_U5:@[0-9]+]] = private unnamed_addr constant [5 x i8] c"%ld\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U5:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @unit6.a = private unnamed_addr constant %struct.U6A { i64 12 }, align 8
+// CHECK-NEXT: [[STRUCT_STR_U6:@[0-9]+]] = private unnamed_addr constant [14 x i8] c"struct U6A {\0A\00"
+// CHECK-NEXT: [[FIELD_U6:@[0-9]+]] = private unnamed_addr constant [19 x i8] c"unsigned long a : \00"
+// CHECK-NEXT: [[FORMAT_U6:@[0-9]+]] = private unnamed_addr constant [5 x i8] c"%lu\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U6:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @unit7.a = private unnamed_addr constant %struct.U7A { i64 12 }, align 8
+// CHECK-NEXT: [[STRUCT_STR_U7:@[0-9]+]] = private unnamed_addr constant [14 x i8] c"struct U7A {\0A\00"
+// CHECK-NEXT: [[FIELD_U7:@[0-9]+]] = private unnamed_addr constant [15 x i8] c"long long a : \00"
+// CHECK-NEXT: [[FORMAT_U7:@[0-9]+]] = private unnamed_addr constant [6 x i8] c"%lld\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U7:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @unit8.a = private unnamed_addr constant %struct.U8A { i64 12 }, align 8
+// CHECK-NEXT: [[STRUCT_STR_U8:@[0-9]+]] = private unnamed_addr constant [14 x i8] c"struct U8A {\0A\00"
+// CHECK-NEXT: [[FIELD_U8:@[0-9]+]] = private unnamed_addr constant [24 x i8] c"unsigned long long a : \00"
+// CHECK-NEXT: [[FORMAT_U8:@[0-9]+]] = private unnamed_addr constant [6 x i8] c"%llu\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U8:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @unit9.a = private unnamed_addr constant %struct.U9A { i8 97 }, align 1
+// CHECK-NEXT: [[STRUCT_STR_U9:@[0-9]+]] = private unnamed_addr constant [14 x i8] c"struct U9A {\0A\00"
+// CHECK-NEXT: [[FIELD_U9:@[0-9]+]] = private unnamed_addr constant [10 x i8] c"char a : \00"
+// CHECK-NEXT: [[FORMAT_U9:@[0-9]+]] = private unnamed_addr constant [4 x i8] c"%c\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U9:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @.str = private unnamed_addr constant [4 x i8] c"LSE\00", align 1
+
+// CHECK: @unit10.a = private unnamed_addr constant %struct.U10A { i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0) }, align 8
+// CHECK-NEXT: [[STRUCT_STR_U10:@[0-9]+]] = private unnamed_addr constant [15 x i8] c"struct U10A {\0A\00"
+// CHECK-NEXT: [[FIELD_U10:@[0-9]+]] = private unnamed_addr constant [12 x i8] c"char * a : \00"
+// CHECK-NEXT: [[FORMAT_U10:@[0-9]+]] = private unnamed_addr constant [4 x i8] c"%s\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U10:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @unit11.a = private unnamed_addr constant %struct.U11A { i8* inttoptr (i64 305419896 to i8*) }, align 8
+// CHECK-NEXT: [[STRUCT_STR_U11:@[0-9]+]] = private unnamed_addr constant [15 x i8] c"struct U11A {\0A\00"
+// CHECK-NEXT: [[FIELD_U11:@[0-9]+]] = private unnamed_addr constant [12 x i8] c"void * a : \00"
+// CHECK-NEXT: [[FORMAT_U11:@[0-9]+]] = private unnamed_addr constant [4 x i8] c"%p\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U11:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @unit12.a = private unnamed_addr constant %struct.U12A { i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0) }, align 8
+// CHECK-NEXT: [[STRUCT_STR_U12:@[0-9]+]] = private unnamed_addr constant [15 x i8] c"struct U12A {\0A\00"
+// CHECK-NEXT: [[FIELD_U12:@[0-9]+]] = private unnamed_addr constant [18 x i8] c"const char * a : \00"
+// CHECK-NEXT: [[FORMAT_U12:@[0-9]+]] = private unnamed_addr constant [4 x i8] c"%s\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U12:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @unit13.a = private unnamed_addr constant %struct.U13A { i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0) }, align 8
+// CHECK-NEXT: [[STRUCT_STR_U13:@[0-9]+]] = private unnamed_addr constant [15 x i8] c"struct U13A {\0A\00"
+// CHECK-NEXT: [[FIELD_U13:@[0-9]+]] = private unnamed_addr constant [20 x i8] c"const charstar a : \00"
+// CHECK-NEXT: [[FORMAT_U13:@[0-9]+]] = private unnamed_addr constant [4 x i8] c"%s\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U13:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @unit14.a = private unnamed_addr constant %struct.U14A { double 0x3FF1F9ACFFA7EB6C }, align 8
+// CHECK-NEXT: [[STRUCT_STR_U14:@[0-9]+]] = private unnamed_addr constant [15 x i8] c"struct U14A {\0A\00"
+// CHECK-NEXT: [[FIELD_U14:@[0-9]+]] = private unnamed_addr constant [12 x i8] c"double a : \00"
+// CHECK-NEXT: [[FORMAT_U14:@[0-9]+]] = private unnamed_addr constant [4 x i8] c"%f\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U14:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @unit15.a = private unnamed_addr constant %struct.U15A { [3 x i32] [i32 1, i32 2, i32 3] }, align 4
+// CHECK-NEXT: [[STRUCT_STR_U15:@[0-9]+]] = private unnamed_addr constant [15 x i8] c"struct U15A {\0A\00"
+// CHECK-NEXT: [[FIELD_U15:@[0-9]+]] = private unnamed_addr constant [13 x i8] c"int [3] a : \00"
+// CHECK-NEXT: [[FORMAT_U15:@[0-9]+]] = private unnamed_addr constant [4 x i8] c"%p\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U15:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @unit16.a = private unnamed_addr constant %struct.U16A { i8 12 }, align 1
+// CHECK-NEXT: [[STRUCT_STR_U16:@[0-9]+]] = private unnamed_addr constant [15 x i8] c"struct U16A {\0A\00"
+// CHECK-NEXT: [[FIELD_U16:@[0-9]+]] = private unnamed_addr constant [13 x i8] c"uint8_t a : \00"
+// CHECK-NEXT: [[FORMAT_U16:@[0-9]+]] = private unnamed_addr constant [6 x i8] c"%hhu\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U16:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @unit17.a = private unnamed_addr constant %struct.U17A { i8 12 }, align 1
+// CHECK-NEXT: [[STRUCT_STR_U17:@[0-9]+]] = private unnamed_addr constant [15 x i8] c"struct U17A {\0A\00"
+// CHECK-NEXT: [[FIELD_U17:@[0-9]+]] = private unnamed_addr constant [12 x i8] c"int8_t a : \00"
+// CHECK-NEXT: [[FORMAT_U17:@[0-9]+]] = private unnamed_addr constant [6 x i8] c"%hhd\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U17:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+// CHECK: @unit18.a = private unnamed_addr constant %struct.U18A { x86_fp80 0xK3FFF8FCD67FD3F5B6000 }, align 16
+// CHECK-NEXT: [[STRUCT_STR_U18:@[0-9]+]] = private unnamed_addr constant [15 x i8] c"struct U18A {\0A\00"
+// CHECK-NEXT: [[FIELD_U18:@[0-9]+]] = private unnamed_addr constant [17 x i8] c"long double a : \00"
+// CHECK-NEXT: [[FORMAT_U18:@[0-9]+]] = private unnamed_addr constant [5 x i8] c"%Lf\0A\00"
+// CHECK-NEXT: [[END_STRUCT_U18:@[0-9]+]] = private unnamed_addr constant [3 x i8] c"}\0A\00"
+
+int printf(const char *fmt, ...) {
+ return 0;
+}
+
+void unit1() {
+ struct U1A {
+ short a;
+ };
+
+ struct U1A a = {
+ .a = 12,
+ };
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* [[STRUCT_STR_U1]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U1A, %struct.U1A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* [[FIELD_U1]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load i16, i16* [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* [[FORMAT_U1]], i32 0, i32 0), i16 [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U1]], i32 0, i32 0))
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit2() {
+ struct U2A {
+ unsigned short a;
+ };
+
+ struct U2A a = {
+ .a = 12,
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* [[STRUCT_STR_U2]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U2A, %struct.U2A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([20 x i8], [20 x i8]* [[FIELD_U2]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load i16, i16* [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* [[FORMAT_U2]], i32 0, i32 0), i16 [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U2]], i32 0, i32 0))
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit3() {
+ struct U3A {
+ int a;
+ };
+
+ struct U3A a = {
+ .a = 12,
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* [[STRUCT_STR_U3]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U3A, %struct.U3A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* [[FIELD_U3]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* [[FORMAT_U3]], i32 0, i32 0), i32 [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U3]], i32 0, i32 0)
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit4() {
+ struct U4A {
+ unsigned int a;
+ };
+
+ struct U4A a = {
+ .a = 12,
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* [[STRUCT_STR_U4]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U4A, %struct.U4A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* [[FIELD_U4]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* [[FORMAT_U4]], i32 0, i32 0), i32 [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U4]], i32 0, i32 0)
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit5() {
+ struct U5A {
+ long a;
+ };
+
+ struct U5A a = {
+ .a = 12,
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* [[STRUCT_STR_U5]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U5A, %struct.U5A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* [[FIELD_U5]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load i64, i64* [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* [[FORMAT_U5]], i32 0, i32 0), i64 [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U5]], i32 0, i32 0)
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit6() {
+ struct U6A {
+ unsigned long a;
+ };
+
+ struct U6A a = {
+ .a = 12,
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* [[STRUCT_STR_U6]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U6A, %struct.U6A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([19 x i8], [19 x i8]* [[FIELD_U6]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load i64, i64* [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* [[FORMAT_U6]], i32 0, i32 0), i64 [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U6]], i32 0, i32 0)
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit7() {
+ struct U7A {
+ long long a;
+ };
+
+ struct U7A a = {
+ .a = 12,
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* [[STRUCT_STR_U7]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U7A, %struct.U7A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* [[FIELD_U7]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load i64, i64* [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* [[FORMAT_U7]], i32 0, i32 0), i64 [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U7]], i32 0, i32 0)
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit8() {
+ struct U8A {
+ unsigned long long a;
+ };
+
+ struct U8A a = {
+ .a = 12,
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* [[STRUCT_STR_U8]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U8A, %struct.U8A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* [[FIELD_U8]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load i64, i64* [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* [[FORMAT_U8]], i32 0, i32 0), i64 [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U8]], i32 0, i32 0)
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit9() {
+ struct U9A {
+ char a;
+ };
+
+ struct U9A a = {
+ .a = 'a',
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* [[STRUCT_STR_U9]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U9A, %struct.U9A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* [[FIELD_U9]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load i8, i8* [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* [[FORMAT_U9]], i32 0, i32 0), i8 [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U9]], i32 0, i32 0)
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit10() {
+ struct U10A {
+ char *a;
+ };
+
+ struct U10A a = {
+ .a = "LSE",
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* [[STRUCT_STR_U10]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U10A, %struct.U10A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* [[FIELD_U10]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load i8*, i8** [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* [[FORMAT_U10]], i32 0, i32 0), i8* [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U10]], i32 0, i32 0)
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit11() {
+ struct U11A {
+ void *a;
+ };
+
+ struct U11A a = {
+ .a = (void *)0x12345678,
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* [[STRUCT_STR_U11]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U11A, %struct.U11A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* [[FIELD_U11]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load i8*, i8** [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* [[FORMAT_U11]], i32 0, i32 0), i8* [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U11]], i32 0, i32 0)
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit12() {
+ struct U12A {
+ const char *a;
+ };
+
+ struct U12A a = {
+ .a = "LSE",
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* [[STRUCT_STR_U12]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U12A, %struct.U12A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* [[FIELD_U12]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load i8*, i8** [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* [[FORMAT_U12]], i32 0, i32 0), i8* [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U12]], i32 0, i32 0)
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit13() {
+ typedef char *charstar;
+ struct U13A {
+ const charstar a;
+ };
+
+ struct U13A a = {
+ .a = "LSE",
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* [[STRUCT_STR_U13]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U13A, %struct.U13A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([20 x i8], [20 x i8]* [[FIELD_U13]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load i8*, i8** [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* [[FORMAT_U13]], i32 0, i32 0), i8* [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U13]], i32 0, i32 0)
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit14() {
+ struct U14A {
+ double a;
+ };
+
+ struct U14A a = {
+ .a = 1.123456,
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* [[STRUCT_STR_U14]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U14A, %struct.U14A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* [[FIELD_U14]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load double, double* [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* [[FORMAT_U14]], i32 0, i32 0), double [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U14]], i32 0, i32 0)
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit15() {
+ struct U15A {
+ int a[3];
+ };
+
+ struct U15A a = {
+ .a = {1, 2, 3},
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* [[STRUCT_STR_U15]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U15A, %struct.U15A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* [[FIELD_U15]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load [3 x i32], [3 x i32]* [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* [[FORMAT_U15]], i32 0, i32 0), [3 x i32] [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U15]], i32 0, i32 0)
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit16() {
+ struct U16A {
+ uint8_t a;
+ };
+
+ struct U16A a = {
+ .a = 12,
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* [[STRUCT_STR_U16]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U16A, %struct.U16A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* [[FIELD_U16]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load i8, i8* [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* [[FORMAT_U16]], i32 0, i32 0), i8 [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U16]], i32 0, i32 0)
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit17() {
+ struct U17A {
+ int8_t a;
+ };
+
+ struct U17A a = {
+ .a = 12,
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* [[STRUCT_STR_U17]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U17A, %struct.U17A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* [[FIELD_U17]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load i8, i8* [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* [[FORMAT_U17]], i32 0, i32 0), i8 [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U17]], i32 0, i32 0)
+ __builtin_dump_struct(&a, &printf);
+}
+
+void unit18() {
+ struct U18A {
+ long double a;
+ };
+
+ struct U18A a = {
+ .a = 1.123456,
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* [[STRUCT_STR_U18]], i32 0, i32 0))
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.U18A, %struct.U18A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* [[FIELD_U18]], i32 0, i32 0))
+ // CHECK: [[LOAD1:%[0-9]+]] = load x86_fp80, x86_fp80* [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* [[FORMAT_U18]], i32 0, i32 0), x86_fp80 [[LOAD1]])
+ // CHECK: call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* [[END_STRUCT_U18]], i32 0, i32 0)
+ __builtin_dump_struct(&a, &printf);
+}
+
+void test1() {
+ struct T1A {
+ int a;
+ char *b;
+ };
+
+ struct T1A a = {
+ .a = 12,
+ .b = "LSE",
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.T1A, %struct.T1A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(
+ // CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf({{.*}}, i32 [[LOAD1]])
+ // CHECK: [[RES2:%[0-9]+]] = getelementptr inbounds %struct.T1A, %struct.T1A* %a, i32 0, i32 1
+ // CHECK: call i32 (i8*, ...) @printf(
+ // CHECK: [[LOAD2:%[0-9]+]] = load i8*, i8** [[RES2]],
+ // CHECK: call i32 (i8*, ...) @printf({{.*}}, i8* [[LOAD2]])
+ // CHECK: call i32 (i8*, ...) @printf(
+ __builtin_dump_struct(&a, &printf);
+}
+
+void test2() {
+ struct T2A {
+ int a;
+ };
+
+ struct T2B {
+ int b;
+ struct T2A a;
+ };
+
+ struct T2B b = {
+ .b = 24,
+ .a = {
+ .a = 12,
+ }
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.T2B, %struct.T2B* %b, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(
+ // CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* [[RES1]],
+ // CHECK: call i32 (i8*, ...) @printf({{.*}}, i32 [[LOAD1]])
+ // CHECK: [[NESTED_STRUCT:%[0-9]+]] = getelementptr inbounds %struct.T2B, %struct.T2B* %b, i32 0, i32 1
+ // CHECK: call i32 (i8*, ...) @printf(
+ // CHECK: [[RES2:%[0-9]+]] = getelementptr inbounds %struct.T2A, %struct.T2A* [[NESTED_STRUCT]], i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(
+ // CHECK: [[LOAD2:%[0-9]+]] = load i32, i32* [[RES2]],
+ // CHECK: call i32 (i8*, ...) @printf({{.*}}, i32 [[LOAD2]])
+ // CHECK: call i32 (i8*, ...) @printf(
+ __builtin_dump_struct(&b, &printf);
+}
+
+void test3() {
+ struct T3A {
+ union {
+ int a;
+ char b[4];
+ };
+ };
+
+ struct T3A a = {
+ .a = 42,
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.T3A, %struct.T3A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(
+ // CHECK: [[BC1:%[0-9]+]] = bitcast %union.anon* [[RES1]] to i32*
+ // CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* [[BC1]],
+ // CHECK: call i32 (i8*, ...) @printf({{.*}}, i32 [[LOAD1]])
+ // CHECK: [[BC2:%[0-9]+]] = bitcast %union.anon* [[RES1]] to [4 x i8]*
+ // CHECK: [[LOAD2:%[0-9]+]] = load [4 x i8], [4 x i8]* [[BC2]],
+ // CHECK: call i32 (i8*, ...) @printf({{.*}}, [4 x i8] [[LOAD2]])
+ // CHECK: call i32 (i8*, ...) @printf(
+ __builtin_dump_struct(&a, &printf);
+}
+
+void test4() {
+ struct T4A {
+ union {
+ struct {
+ void *a;
+ };
+ struct {
+ unsigned long b;
+ };
+ };
+ };
+
+ struct T4A a = {
+ .a = (void *)0x12345678,
+ };
+
+ // CHECK: call i32 (i8*, ...) @printf(
+ // CHECK: [[RES1:%[0-9]+]] = getelementptr inbounds %struct.T4A, %struct.T4A* %a, i32 0, i32 0
+ // CHECK: call i32 (i8*, ...) @printf(
+ // CHECK: [[BC1:%[0-9]+]] = bitcast %union.anon.0* [[RES1]] to %struct.anon*
+ // CHECK: [[RES2:%[0-9]+]] = getelementptr inbounds %struct.anon, %struct.anon* [[BC1]], i32 0, i32 0
+ // CHECK: [[LOAD1:%[0-9]+]] = load i8*, i8** [[RES2]],
+ // CHECK: call i32 (i8*, ...) @printf({{.*}}, i8* [[LOAD1]])
+
+ // CHECK: [[BC2:%[0-9]+]] = bitcast %union.anon.0* [[RES1]] to %struct.anon.1*
+ // CHECK: [[RES3:%[0-9]+]] = getelementptr inbounds %struct.anon.1, %struct.anon.1* [[BC2]], i32 0, i32 0
+ // CHECK: [[LOAD2:%[0-9]+]] = load i64, i64* [[RES3]],
+ // CHECK: call i32 (i8*, ...) @printf({{.*}}, i64 [[LOAD2]])
+ // CHECK: call i32 (i8*, ...) @printf(
+ __builtin_dump_struct(&a, &printf);
+}
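
The dump-struct tests above all exercise `__builtin_dump_struct(&a, &printf)`, which Clang expands at the call site into a sequence of `printf` calls: one for the struct header, one pair per field (field name string, then the value with a type-appropriate format), and one for the closing brace, as the CHECK lines verify. As a rough hand-written analogue only (the exact strings are compiler-generated; the names below are illustrative, not what the builtin emits):

    #include <stdio.h>

    struct Pair {
      int a;
      const char *b;
    };

    /* Approximates what __builtin_dump_struct(&p, &printf) prints for
       this struct; the real builtin derives names/formats from the type. */
    static void dump_pair_by_hand(const struct Pair *p) {
      printf("struct Pair {\n");
      printf("int a : ");
      printf("%d\n", p->a);
      printf("const char *b : ");
      printf("%s\n", p->b);
      printf("}\n");
    }

    int main(void) {
      struct Pair p = { 12, "LSE" };
      dump_pair_by_hand(&p);
      return 0;
    }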
diff --git a/test/CodeGen/elf-linker-options.c b/test/CodeGen/elf-linker-options.c
new file mode 100644
index 000000000000..cf2d1b92b998
--- /dev/null
+++ b/test/CodeGen/elf-linker-options.c
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -triple i686---elf -emit-llvm %s -o - | FileCheck %s
+
+#pragma comment(lib, "alpha")
+
+// CHECK: !llvm.linker.options = !{[[NODE:![0-9]+]]}
+// CHECK: [[NODE]] = !{!"lib", !"alpha"}
+
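The elf-linker-options.c test above checks that `#pragma comment(lib, ...)` is recorded as `!llvm.linker.options` metadata when targeting ELF. A small usage sketch, with placeholder library names (enabling Microsoft-extension pragmas may be required depending on how the compiler is invoked):

    /* Each pragma should become one metadata node such as !{!"lib", !"m"}. */
    #pragma comment(lib, "m")
    #pragma comment(lib, "pthread")

    int uses_the_libraries(void) { return 0; }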
diff --git a/test/CodeGen/emit-summary-index.c b/test/CodeGen/emit-summary-index.c
new file mode 100644
index 000000000000..612641020689
--- /dev/null
+++ b/test/CodeGen/emit-summary-index.c
@@ -0,0 +1,17 @@
+// ; Check that the -flto=thin option emits a ThinLTO summary
+// RUN: %clang_cc1 -flto=thin -emit-llvm-bc < %s | llvm-bcanalyzer -dump | FileCheck %s
+// CHECK: <GLOBALVAL_SUMMARY_BLOCK
+//
+// ; Check that we do not emit a summary for regular LTO on Apple platforms
+// RUN: %clang_cc1 -flto -triple x86_64-apple-darwin -emit-llvm-bc < %s | llvm-bcanalyzer -dump | FileCheck --check-prefix=LTO %s
+// LTO-NOT: GLOBALVAL_SUMMARY_BLOCK
+//
+// ; Check that we emit a summary for regular LTO by default elsewhere
+// RUN: %clang_cc1 -flto -triple x86_64-pc-linux-gnu -emit-llvm-bc < %s | llvm-bcanalyzer -dump | FileCheck --check-prefix=LTOINDEX %s
+// LTOINDEX: <FULL_LTO_GLOBALVAL_SUMMARY_BLOCK
+//
+// ; Simulate -save-temps and check that it works (!"ThinLTO" module flag not added multiple times)
+// RUN: %clang_cc1 -flto -triple x86_64-pc-linux-gnu -emit-llvm-bc -disable-llvm-passes < %s -o %t.bc
+// RUN: %clang_cc1 -flto -triple x86_64-pc-linux-gnu -emit-llvm-bc -x ir < %t.bc | llvm-bcanalyzer -dump | FileCheck --check-prefix=LTOINDEX %s
+
+int main() {}
diff --git a/test/CodeGen/exceptions-seh-finally.c b/test/CodeGen/exceptions-seh-finally.c
index 0f2123ba32bb..d863eb1bb47e 100644
--- a/test/CodeGen/exceptions-seh-finally.c
+++ b/test/CodeGen/exceptions-seh-finally.c
@@ -13,22 +13,22 @@ void basic_finally(void) {
}
}
-// CHECK-LABEL: define void @basic_finally()
+// CHECK-LABEL: define dso_local void @basic_finally()
// CHECK: invoke void @might_crash()
// CHECK: to label %[[invoke_cont:[^ ]*]] unwind label %[[lpad:[^ ]*]]
//
// CHECK: [[invoke_cont]]
// CHECK: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK: call void @"\01?fin$0@0@basic_finally@@"({{i8( zeroext)?}} 0, i8* %[[fp]])
+// CHECK: call void @"?fin$0@0@basic_finally@@"({{i8( zeroext)?}} 0, i8* %[[fp]])
// CHECK-NEXT: ret void
//
// CHECK: [[lpad]]
// CHECK-NEXT: %[[pad:[^ ]*]] = cleanuppad
// CHECK: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK: call void @"\01?fin$0@0@basic_finally@@"({{i8( zeroext)?}} 1, i8* %[[fp]])
+// CHECK: call void @"?fin$0@0@basic_finally@@"({{i8( zeroext)?}} 1, i8* %[[fp]])
// CHECK-NEXT: cleanupret from %[[pad]] unwind to caller
-// CHECK: define internal void @"\01?fin$0@0@basic_finally@@"({{.*}})
+// CHECK: define internal void @"?fin$0@0@basic_finally@@"({{.*}})
// CHECK-SAME: [[finally_attrs:#[0-9]+]]
// CHECK: call void @cleanup()
@@ -53,16 +53,16 @@ l:
}
}
-// CHECK-LABEL: define void @label_in_finally()
+// CHECK-LABEL: define dso_local void @label_in_finally()
// CHECK: invoke void @might_crash()
// CHECK: to label %[[invoke_cont:[^ ]*]] unwind label %[[lpad:[^ ]*]]
//
// CHECK: [[invoke_cont]]
// CHECK: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK: call void @"\01?fin$0@0@label_in_finally@@"({{i8( zeroext)?}} 0, i8* %[[fp]])
+// CHECK: call void @"?fin$0@0@label_in_finally@@"({{i8( zeroext)?}} 0, i8* %[[fp]])
// CHECK: ret void
-// CHECK: define internal void @"\01?fin$0@0@label_in_finally@@"({{.*}})
+// CHECK: define internal void @"?fin$0@0@label_in_finally@@"({{.*}})
// CHECK-SAME: [[finally_attrs]]
// CHECK: br label %[[l:[^ ]*]]
//
@@ -81,22 +81,22 @@ void use_abnormal_termination(void) {
}
}
-// CHECK-LABEL: define void @use_abnormal_termination()
+// CHECK-LABEL: define dso_local void @use_abnormal_termination()
// CHECK: invoke void @might_crash()
// CHECK: to label %[[invoke_cont:[^ ]*]] unwind label %[[lpad:[^ ]*]]
//
// CHECK: [[invoke_cont]]
// CHECK: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK: call void @"\01?fin$0@0@use_abnormal_termination@@"({{i8( zeroext)?}} 0, i8* %[[fp]])
+// CHECK: call void @"?fin$0@0@use_abnormal_termination@@"({{i8( zeroext)?}} 0, i8* %[[fp]])
// CHECK: ret void
//
// CHECK: [[lpad]]
// CHECK-NEXT: %[[pad:[^ ]*]] = cleanuppad
// CHECK: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK: call void @"\01?fin$0@0@use_abnormal_termination@@"({{i8( zeroext)?}} 1, i8* %[[fp]])
+// CHECK: call void @"?fin$0@0@use_abnormal_termination@@"({{i8( zeroext)?}} 1, i8* %[[fp]])
// CHECK-NEXT: cleanupret from %[[pad]] unwind to caller
-// CHECK: define internal void @"\01?fin$0@0@use_abnormal_termination@@"({{i8( zeroext)?}} %[[abnormal:abnormal_termination]], i8* %frame_pointer)
+// CHECK: define internal void @"?fin$0@0@use_abnormal_termination@@"({{i8( zeroext)?}} %[[abnormal:abnormal_termination]], i8* %frame_pointer)
// CHECK-SAME: [[finally_attrs]]
// CHECK: %[[abnormal_zext:[^ ]*]] = zext i8 %[[abnormal]] to i32
// CHECK: store i32 %[[abnormal_zext]], i32* @crashed
@@ -110,11 +110,11 @@ void noreturn_noop_finally() {
}
}
-// CHECK-LABEL: define void @noreturn_noop_finally()
-// CHECK: call void @"\01?fin$0@0@noreturn_noop_finally@@"({{.*}})
+// CHECK-LABEL: define dso_local void @noreturn_noop_finally()
+// CHECK: call void @"?fin$0@0@noreturn_noop_finally@@"({{.*}})
// CHECK: ret void
-// CHECK: define internal void @"\01?fin$0@0@noreturn_noop_finally@@"({{.*}})
+// CHECK: define internal void @"?fin$0@0@noreturn_noop_finally@@"({{.*}})
// CHECK-SAME: [[finally_attrs]]
// CHECK: call void @abort()
// CHECK: unreachable
@@ -127,20 +127,20 @@ void noreturn_finally() {
}
}
-// CHECK-LABEL: define void @noreturn_finally()
+// CHECK-LABEL: define dso_local void @noreturn_finally()
// CHECK: invoke void @might_crash()
// CHECK: to label %[[cont:[^ ]*]] unwind label %[[lpad:[^ ]*]]
//
// CHECK: [[cont]]
-// CHECK: call void @"\01?fin$0@0@noreturn_finally@@"({{.*}})
+// CHECK: call void @"?fin$0@0@noreturn_finally@@"({{.*}})
// CHECK: ret void
//
// CHECK: [[lpad]]
// CHECK-NEXT: %[[pad:[^ ]*]] = cleanuppad
-// CHECK: call void @"\01?fin$0@0@noreturn_finally@@"({{.*}})
+// CHECK: call void @"?fin$0@0@noreturn_finally@@"({{.*}})
// CHECK-NEXT: cleanupret from %[[pad]] unwind to caller
-// CHECK: define internal void @"\01?fin$0@0@noreturn_finally@@"({{.*}})
+// CHECK: define internal void @"?fin$0@0@noreturn_finally@@"({{.*}})
// CHECK-SAME: [[finally_attrs]]
// CHECK: call void @abort()
// CHECK: unreachable
@@ -151,11 +151,11 @@ int finally_with_return() {
} __finally {
}
}
-// CHECK-LABEL: define i32 @finally_with_return()
-// CHECK: call void @"\01?fin$0@0@finally_with_return@@"({{.*}})
+// CHECK-LABEL: define dso_local i32 @finally_with_return()
+// CHECK: call void @"?fin$0@0@finally_with_return@@"({{.*}})
// CHECK-NEXT: ret i32 42
-// CHECK: define internal void @"\01?fin$0@0@finally_with_return@@"({{.*}})
+// CHECK: define internal void @"?fin$0@0@finally_with_return@@"({{.*}})
// CHECK-SAME: [[finally_attrs]]
// CHECK-NOT: br i1
// CHECK-NOT: br label
@@ -173,24 +173,24 @@ int nested___finally___finally() {
return 0;
}
-// CHECK-LABEL: define i32 @nested___finally___finally
-// CHECK: invoke void @"\01?fin$1@0@nested___finally___finally@@"({{.*}})
+// CHECK-LABEL: define dso_local i32 @nested___finally___finally
+// CHECK: invoke void @"?fin$1@0@nested___finally___finally@@"({{.*}})
// CHECK: to label %[[outercont:[^ ]*]] unwind label %[[lpad:[^ ]*]]
//
// CHECK: [[outercont]]
-// CHECK: call void @"\01?fin$0@0@nested___finally___finally@@"({{.*}})
+// CHECK: call void @"?fin$0@0@nested___finally___finally@@"({{.*}})
// CHECK-NEXT: ret i32 0
//
// CHECK: [[lpad]]
// CHECK-NEXT: %[[pad:[^ ]*]] = cleanuppad
-// CHECK: call void @"\01?fin$0@0@nested___finally___finally@@"({{.*}})
+// CHECK: call void @"?fin$0@0@nested___finally___finally@@"({{.*}})
// CHECK-NEXT: cleanupret from %[[pad]] unwind to caller
-// CHECK-LABEL: define internal void @"\01?fin$0@0@nested___finally___finally@@"({{.*}})
+// CHECK-LABEL: define internal void @"?fin$0@0@nested___finally___finally@@"({{.*}})
// CHECK-SAME: [[finally_attrs]]
// CHECK: ret void
-// CHECK-LABEL: define internal void @"\01?fin$1@0@nested___finally___finally@@"({{.*}})
+// CHECK-LABEL: define internal void @"?fin$1@0@nested___finally___finally@@"({{.*}})
// CHECK-SAME: [[finally_attrs]]
// CHECK: unreachable
@@ -208,21 +208,21 @@ int nested___finally___finally_with_eh_edge() {
}
return 912;
}
-// CHECK-LABEL: define i32 @nested___finally___finally_with_eh_edge
+// CHECK-LABEL: define dso_local i32 @nested___finally___finally_with_eh_edge
// CHECK: invoke void @might_crash()
// CHECK-NEXT: to label %[[invokecont:[^ ]*]] unwind label %[[lpad1:[^ ]*]]
//
// [[invokecont]]
-// CHECK: invoke void @"\01?fin$1@0@nested___finally___finally_with_eh_edge@@"({{.*}})
+// CHECK: invoke void @"?fin$1@0@nested___finally___finally_with_eh_edge@@"({{.*}})
// CHECK-NEXT: to label %[[outercont:[^ ]*]] unwind label %[[lpad2:[^ ]*]]
//
// CHECK: [[outercont]]
-// CHECK: call void @"\01?fin$0@0@nested___finally___finally_with_eh_edge@@"({{.*}})
+// CHECK: call void @"?fin$0@0@nested___finally___finally_with_eh_edge@@"({{.*}})
// CHECK-NEXT: ret i32 912
//
// CHECK: [[lpad1]]
// CHECK-NEXT: %[[innerpad:[^ ]*]] = cleanuppad
-// CHECK: invoke void @"\01?fin$1@0@nested___finally___finally_with_eh_edge@@"({{.*}})
+// CHECK: invoke void @"?fin$1@0@nested___finally___finally_with_eh_edge@@"({{.*}})
// CHECK-NEXT: label %[[innercleanupretbb:[^ ]*]] unwind label %[[lpad2:[^ ]*]]
//
// CHECK: [[innercleanupretbb]]
@@ -230,14 +230,14 @@ int nested___finally___finally_with_eh_edge() {
//
// CHECK: [[lpad2]]
// CHECK-NEXT: %[[outerpad:[^ ]*]] = cleanuppad
-// CHECK: call void @"\01?fin$0@0@nested___finally___finally_with_eh_edge@@"({{.*}})
+// CHECK: call void @"?fin$0@0@nested___finally___finally_with_eh_edge@@"({{.*}})
// CHECK-NEXT: cleanupret from %[[outerpad]] unwind to caller
-// CHECK-LABEL: define internal void @"\01?fin$0@0@nested___finally___finally_with_eh_edge@@"({{.*}})
+// CHECK-LABEL: define internal void @"?fin$0@0@nested___finally___finally_with_eh_edge@@"({{.*}})
// CHECK-SAME: [[finally_attrs]]
// CHECK: ret void
-// CHECK-LABEL: define internal void @"\01?fin$1@0@nested___finally___finally_with_eh_edge@@"({{.*}})
+// CHECK-LABEL: define internal void @"?fin$1@0@nested___finally___finally_with_eh_edge@@"({{.*}})
// CHECK-SAME: [[finally_attrs]]
// CHECK: unreachable
@@ -252,22 +252,34 @@ void finally_within_finally() {
}
}
-// CHECK-LABEL: define void @finally_within_finally(
+// CHECK-LABEL: define dso_local void @finally_within_finally(
// CHECK: invoke void @might_crash(
-// CHECK: call void @"\01?fin$0@0@finally_within_finally@@"(
-// CHECK: call void @"\01?fin$0@0@finally_within_finally@@"({{.*}}) [ "funclet"(
+// CHECK: call void @"?fin$0@0@finally_within_finally@@"(
+// CHECK: call void @"?fin$0@0@finally_within_finally@@"({{.*}}) [ "funclet"(
-// CHECK-LABEL: define internal void @"\01?fin$0@0@finally_within_finally@@"({{[^)]*}})
+// CHECK-LABEL: define internal void @"?fin$0@0@finally_within_finally@@"({{[^)]*}})
// CHECK-SAME: [[finally_attrs]]
// CHECK: invoke void @might_crash(
-// CHECK: call void @"\01?fin$1@0@finally_within_finally@@"(
-// CHECK: call void @"\01?fin$1@0@finally_within_finally@@"({{.*}}) [ "funclet"(
+// CHECK: call void @"?fin$1@0@finally_within_finally@@"(
+// CHECK: call void @"?fin$1@0@finally_within_finally@@"({{.*}}) [ "funclet"(
-// CHECK-LABEL: define internal void @"\01?fin$1@0@finally_within_finally@@"({{[^)]*}})
+// CHECK-LABEL: define internal void @"?fin$1@0@finally_within_finally@@"({{[^)]*}})
// CHECK-SAME: [[finally_attrs]]
+void cleanup_with_func(const char *);
+void finally_with_func() {
+ __try {
+ might_crash();
+ } __finally {
+ cleanup_with_func(__func__);
+ }
+}
+
+// CHECK-LABEL: define internal void @"?fin$0@0@finally_with_func@@"({{[^)]*}})
+// CHECK: call void @cleanup_with_func(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @"??_C@_0BC@COAGBPGM@finally_with_func?$AA@", i32 0, i32 0))
+
// Look for the absence of noinline. Enum attributes come first, so check that
// a string attribute is the first to verify that no enum attributes are
// present.
diff --git a/test/CodeGen/exceptions-seh-leave.c b/test/CodeGen/exceptions-seh-leave.c
index 087fadbcd7ab..e977e32e9fa4 100644
--- a/test/CodeGen/exceptions-seh-leave.c
+++ b/test/CodeGen/exceptions-seh-leave.c
@@ -17,7 +17,7 @@ int __leave_with___except_simple() {
}
return 1;
}
-// CHECK-LABEL: define i32 @__leave_with___except_simple()
+// CHECK-LABEL: define dso_local i32 @__leave_with___except_simple()
// CHECK: store i32 15, i32* %myres
// CHECK-NEXT: br label %[[tryleave:[^ ]*]]
// CHECK-NOT: store i32 23
@@ -37,7 +37,7 @@ int __leave_with___except() {
}
return 1;
}
-// CHECK-LABEL: define i32 @__leave_with___except()
+// CHECK-LABEL: define dso_local i32 @__leave_with___except()
// CHECK: invoke void @g()
// CHECK-NEXT: to label %[[cont:.*]] unwind label %{{.*}}
// For __excepts, instead of an explicit __try.__leave label, we could use
@@ -69,13 +69,13 @@ int __leave_with___finally_simple() {
}
return 1;
}
-// CHECK-LABEL: define i32 @__leave_with___finally_simple()
+// CHECK-LABEL: define dso_local i32 @__leave_with___finally_simple()
// CHECK: store i32 15, i32* %myres
// CHECK-NEXT: br label %[[tryleave:[^ ]*]]
// CHECK-NOT: store i32 23
// CHECK: [[tryleave]]
// CHECK-NEXT: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK-NEXT: call void @"\01?fin$0@0@__leave_with___finally_simple@@"(i8 0, i8* %[[fp]])
+// CHECK-NEXT: call void @"?fin$0@0@__leave_with___finally_simple@@"(i8 0, i8* %[[fp]])
// __finally block doesn't return, __finally.cont doesn't exist.
int __leave_with___finally_noreturn() {
@@ -89,13 +89,13 @@ int __leave_with___finally_noreturn() {
}
return 1;
}
-// CHECK-LABEL: define i32 @__leave_with___finally_noreturn()
+// CHECK-LABEL: define dso_local i32 @__leave_with___finally_noreturn()
// CHECK: store i32 15, i32* %myres
// CHECK-NEXT: br label %[[tryleave:[^ ]*]]
// CHECK-NOT: store i32 23
// CHECK: [[tryleave]]
// CHECK-NEXT: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK-NEXT: call void @"\01?fin$0@0@__leave_with___finally_noreturn@@"(i8 0, i8* %[[fp]])
+// CHECK-NEXT: call void @"?fin$0@0@__leave_with___finally_noreturn@@"(i8 0, i8* %[[fp]])
// The "normal" case.
int __leave_with___finally() {
@@ -109,7 +109,7 @@ int __leave_with___finally() {
}
return 1;
}
-// CHECK-LABEL: define i32 @__leave_with___finally()
+// CHECK-LABEL: define dso_local i32 @__leave_with___finally()
// CHECK: invoke void @g()
// CHECK-NEXT: to label %[[cont:.*]] unwind label %{{.*}}
// For __finally, there needs to be an explicit __try.__leave, because
@@ -119,7 +119,7 @@ int __leave_with___finally() {
// CHECK-NOT: store i32 23
// CHECK: [[tryleave]]
// CHECK-NEXT: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK-NEXT: call void @"\01?fin$0@0@__leave_with___finally@@"(i8 0, i8* %[[fp]])
+// CHECK-NEXT: call void @"?fin$0@0@__leave_with___finally@@"(i8 0, i8* %[[fp]])
//////////////////////////////////////////////////////////////////////////////
@@ -142,14 +142,14 @@ int nested___except___finally() {
}
return 1;
}
-// CHECK-LABEL: define i32 @nested___except___finally()
+// CHECK-LABEL: define dso_local i32 @nested___except___finally()
// CHECK-LABEL: invoke void @g()
// CHECK-NEXT: to label %[[g1_cont1:.*]] unwind label %[[g1_lpad:.*]]
// CHECK: [[g1_cont1]]
// CHECK-NEXT: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK-NEXT: invoke void @"\01?fin$0@0@nested___except___finally@@"(i8 0, i8* %[[fp]])
+// CHECK-NEXT: invoke void @"?fin$0@0@nested___except___finally@@"(i8 0, i8* %[[fp]])
// CHECK-NEXT: to label %[[fin_cont:.*]] unwind label %[[g2_lpad:.*]]
// CHECK: [[fin_cont]]
@@ -159,7 +159,7 @@ int nested___except___finally() {
// CHECK: [[g1_lpad]]
// CHECK-NEXT: cleanuppad
// CHECK-NEXT: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK-NEXT: invoke void @"\01?fin$0@0@nested___except___finally@@"(i8 1, i8* %[[fp]])
+// CHECK-NEXT: invoke void @"?fin$0@0@nested___except___finally@@"(i8 1, i8* %[[fp]])
// CHECK-NEXT: to label %[[g1_resume:.*]] unwind label %[[g2_lpad]]
// CHECK: cleanupret {{.*}} unwind label %[[g2_lpad]]
@@ -171,7 +171,7 @@ int nested___except___finally() {
// CHECK: [[trycont]]
// CHECK-NEXT: ret i32 1
-// CHECK-LABEL: define internal void @"\01?fin$0@0@nested___except___finally@@"(i8 %abnormal_termination, i8* %frame_pointer)
+// CHECK-LABEL: define internal void @"?fin$0@0@nested___except___finally@@"(i8 %abnormal_termination, i8* %frame_pointer)
// CHECK: call void @g()
// CHECK: unreachable
@@ -194,7 +194,7 @@ int nested___except___except() {
return 1;
}
// The order of basic blocks in the below doesn't matter.
-// CHECK-LABEL: define i32 @nested___except___except()
+// CHECK-LABEL: define dso_local i32 @nested___except___except()
// CHECK-LABEL: invoke void @g()
// CHECK-NEXT: to label %[[g1_cont:.*]] unwind label %[[g1_lpad:.*]]
@@ -247,7 +247,7 @@ int nested___finally___except() {
return 1;
}
// The order of basic blocks in the below doesn't matter.
-// CHECK-LABEL: define i32 @nested___finally___except()
+// CHECK-LABEL: define dso_local i32 @nested___finally___except()
// CHECK-LABEL: invoke void @g()
// CHECK-NEXT: to label %[[g1_cont:.*]] unwind label %[[g1_lpad:.*]]
@@ -271,16 +271,16 @@ int nested___finally___except() {
// CHECK: [[tryleave]]
// CHECK: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK-NEXT: call void @"\01?fin$0@0@nested___finally___except@@"(i8 0, i8* %[[fp]])
+// CHECK-NEXT: call void @"?fin$0@0@nested___finally___except@@"(i8 0, i8* %[[fp]])
// CHECK-NEXT: ret i32 1
// CHECK: [[g2_lpad]]
// CHECK: cleanuppad
// CHECK: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK-NEXT: call void @"\01?fin$0@0@nested___finally___except@@"(i8 1, i8* %[[fp]])
+// CHECK-NEXT: call void @"?fin$0@0@nested___finally___except@@"(i8 1, i8* %[[fp]])
// CHECK: cleanupret {{.*}} unwind to caller
-// CHECK-LABEL: define internal void @"\01?fin$0@0@nested___finally___except@@"(i8 %abnormal_termination, i8* %frame_pointer)
+// CHECK-LABEL: define internal void @"?fin$0@0@nested___finally___except@@"(i8 %abnormal_termination, i8* %frame_pointer)
// CHECK: ret void
int nested___finally___finally() {
@@ -302,7 +302,7 @@ int nested___finally___finally() {
return 1;
}
// The order of basic blocks in the below doesn't matter.
-// CHECK-LABEL: define i32 @nested___finally___finally()
+// CHECK-LABEL: define dso_local i32 @nested___finally___finally()
// CHECK: invoke void @g()
// CHECK-NEXT: to label %[[g1_cont:.*]] unwind label %[[g1_lpad:.*]]
@@ -310,19 +310,19 @@ int nested___finally___finally() {
// CHECK: [[g1_cont]]
// CHECK: store i32 16, i32* %[[myres:[^ ]*]],
// CHECK: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK-NEXT: invoke void @"\01?fin$1@0@nested___finally___finally@@"(i8 0, i8* %[[fp]])
+// CHECK-NEXT: invoke void @"?fin$1@0@nested___finally___finally@@"(i8 0, i8* %[[fp]])
// CHECK-NEXT: to label %[[finally_cont:.*]] unwind label %[[g2_lpad:.*]]
// CHECK: [[finally_cont]]
// CHECK: store i32 51, i32* %[[myres]]
// CHECK: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK-NEXT: call void @"\01?fin$0@0@nested___finally___finally@@"(i8 0, i8* %[[fp]])
+// CHECK-NEXT: call void @"?fin$0@0@nested___finally___finally@@"(i8 0, i8* %[[fp]])
// CHECK-NEXT: ret i32 1
// CHECK: [[g1_lpad]]
// CHECK-NEXT: %[[padtoken:[^ ]*]] = cleanuppad within none []
// CHECK-NEXT: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK-NEXT: invoke void @"\01?fin$1@0@nested___finally___finally@@"(i8 1, i8* %[[fp]])
+// CHECK-NEXT: invoke void @"?fin$1@0@nested___finally___finally@@"(i8 1, i8* %[[fp]])
// CHECK-NEXT: to label %[[finally_cont2:.*]] unwind label %[[g2_lpad]]
// CHECK: [[finally_cont2]]
// CHECK: cleanupret from %[[padtoken]] unwind label %[[g2_lpad]]
@@ -330,12 +330,12 @@ int nested___finally___finally() {
// CHECK: [[g2_lpad]]
// CHECK-NEXT: %[[padtoken:[^ ]*]] = cleanuppad within none []
// CHECK-NEXT: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK-NEXT: call void @"\01?fin$0@0@nested___finally___finally@@"(i8 1, i8* %[[fp]])
+// CHECK-NEXT: call void @"?fin$0@0@nested___finally___finally@@"(i8 1, i8* %[[fp]])
// CHECK: cleanupret from %[[padtoken]] unwind to caller
-// CHECK-LABEL: define internal void @"\01?fin$0@0@nested___finally___finally@@"(i8 %abnormal_termination, i8* %frame_pointer)
+// CHECK-LABEL: define internal void @"?fin$0@0@nested___finally___finally@@"(i8 %abnormal_termination, i8* %frame_pointer)
// CHECK: ret void
-// CHECK-LABEL: define internal void @"\01?fin$1@0@nested___finally___finally@@"(i8 %abnormal_termination, i8* %frame_pointer)
+// CHECK-LABEL: define internal void @"?fin$1@0@nested___finally___finally@@"(i8 %abnormal_termination, i8* %frame_pointer)
// CHECK: call void @g()
// CHECK: unreachable
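
The exceptions-seh-leave.c checks above cover the `__leave` statement, which exits the innermost `__try` body early while still running the associated `__finally` (the outlined `?fin$...` helpers, now checked without the `\01` mangling prefix). A minimal sketch of the source pattern, assuming an MSVC-targeting build with SEH enabled:

    void might_crash(void);

    int leave_example(void) {
      int r = 0;
      __try {
        r = 15;
        __leave;   /* exit the __try body early */
        r = 23;    /* unreachable: skipped by __leave */
      } __finally {
        r += 1;    /* the __finally still runs on the __leave path,
                      with abnormal_termination == 0 */
      }
      return r;
    }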
diff --git a/test/CodeGen/exceptions-seh.c b/test/CodeGen/exceptions-seh.c
index a0a1dbccd158..b38c7a2490c0 100644
--- a/test/CodeGen/exceptions-seh.c
+++ b/test/CodeGen/exceptions-seh.c
@@ -10,7 +10,7 @@
void try_body(int numerator, int denominator, int *myres) {
*myres = numerator / denominator;
}
-// CHECK-LABEL: define void @try_body(i32 %numerator, i32 %denominator, i32* %myres)
+// CHECK-LABEL: define dso_local void @try_body(i32 %numerator, i32 %denominator, i32* %myres)
// CHECK: sdiv i32
// CHECK: store i32 %{{.*}}, i32*
// CHECK: ret void
@@ -27,7 +27,7 @@ int safe_div(int numerator, int denominator, int *res) {
return success;
}
-// CHECK-LABEL: define i32 @safe_div(i32 %numerator, i32 %denominator, i32* %res)
+// CHECK-LABEL: define dso_local i32 @safe_div(i32 %numerator, i32 %denominator, i32* %res)
// X64-SAME: personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
// X86-SAME: personality i8* bitcast (i32 (...)* @_except_handler3 to i8*)
// CHECK: invoke void @try_body(i32 %{{.*}}, i32 %{{.*}}, i32* %{{.*}}) #[[NOINLINE:[0-9]+]]
@@ -35,7 +35,7 @@ int safe_div(int numerator, int denominator, int *res) {
//
// CHECK: [[catchpad]]
// X64: %[[padtoken:[^ ]*]] = catchpad within %{{[^ ]*}} [i8* null]
-// X86: %[[padtoken:[^ ]*]] = catchpad within %{{[^ ]*}} [i8* bitcast (i32 ()* @"\01?filt$0@0@safe_div@@" to i8*)]
+// X86: %[[padtoken:[^ ]*]] = catchpad within %{{[^ ]*}} [i8* bitcast (i32 ()* @"?filt$0@0@safe_div@@" to i8*)]
// CHECK-NEXT: catchret from %[[padtoken]] to label %[[except:[^ ]*]]
//
// CHECK: [[except]]
@@ -46,7 +46,7 @@ int safe_div(int numerator, int denominator, int *res) {
// 32-bit SEH needs this filter to save the exception code.
//
-// X86-LABEL: define internal i32 @"\01?filt$0@0@safe_div@@"()
+// X86-LABEL: define internal i32 @"?filt$0@0@safe_div@@"()
// X86: %[[ebp:[^ ]*]] = call i8* @llvm.frameaddress(i32 1)
// X86: %[[fp:[^ ]*]] = call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 (i32, i32, i32*)* @safe_div to i8*), i8* %[[ebp]])
// X86: call i8* @llvm.localrecover(i8* bitcast (i32 (i32, i32, i32*)* @safe_div to i8*), i8* %[[fp]], i32 0)
@@ -57,9 +57,9 @@ int safe_div(int numerator, int denominator, int *res) {
// X86: ret i32 1
// Mingw uses msvcrt, so it can also use _except_handler3.
-// X86-GNU-LABEL: define i32 @safe_div(i32 %numerator, i32 %denominator, i32* %res)
+// X86-GNU-LABEL: define dso_local i32 @safe_div(i32 %numerator, i32 %denominator, i32* %res)
// X86-GNU-SAME: personality i8* bitcast (i32 (...)* @_except_handler3 to i8*)
-// X64-GNU-LABEL: define i32 @safe_div(i32 %numerator, i32 %denominator, i32* %res)
+// X64-GNU-LABEL: define dso_local i32 @safe_div(i32 %numerator, i32 %denominator, i32* %res)
// X64-GNU-SAME: personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
void j(void);
@@ -74,7 +74,7 @@ int filter_expr_capture(void) {
return r;
}
-// CHECK-LABEL: define i32 @filter_expr_capture()
+// CHECK-LABEL: define dso_local i32 @filter_expr_capture()
// X64-SAME: personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
// X86-SAME: personality i8* bitcast (i32 (...)* @_except_handler3 to i8*)
// X64: call void (...) @llvm.localescape(i32* %[[r:[^ ,]*]])
@@ -82,17 +82,17 @@ int filter_expr_capture(void) {
// CHECK: store i32 42, i32* %[[r]]
// CHECK: invoke void @j() #[[NOINLINE]]
//
-// CHECK: catchpad within %{{[^ ]*}} [i8* bitcast (i32 ({{.*}})* @"\01?filt$0@0@filter_expr_capture@@" to i8*)]
+// CHECK: catchpad within %{{[^ ]*}} [i8* bitcast (i32 ({{.*}})* @"?filt$0@0@filter_expr_capture@@" to i8*)]
// CHECK: store i32 13, i32* %[[r]]
//
// CHECK: %[[rv:[^ ]*]] = load i32, i32* %[[r]]
// CHECK: ret i32 %[[rv]]
-// X64-LABEL: define internal i32 @"\01?filt$0@0@filter_expr_capture@@"(i8* %exception_pointers, i8* %frame_pointer)
+// X64-LABEL: define internal i32 @"?filt$0@0@filter_expr_capture@@"(i8* %exception_pointers, i8* %frame_pointer)
// X64: %[[fp:[^ ]*]] = call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @filter_expr_capture to i8*), i8* %frame_pointer)
// X64: call i8* @llvm.localrecover(i8* bitcast (i32 ()* @filter_expr_capture to i8*), i8* %[[fp]], i32 0)
//
-// X86-LABEL: define internal i32 @"\01?filt$0@0@filter_expr_capture@@"()
+// X86-LABEL: define internal i32 @"?filt$0@0@filter_expr_capture@@"()
// X86: %[[ebp:[^ ]*]] = call i8* @llvm.frameaddress(i32 1)
// X86: %[[fp:[^ ]*]] = call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @filter_expr_capture to i8*), i8* %[[ebp]])
// X86: call i8* @llvm.localrecover(i8* bitcast (i32 ()* @filter_expr_capture to i8*), i8* %[[fp]], i32 0)
@@ -114,7 +114,7 @@ int nested_try(void) {
}
return r;
}
-// CHECK-LABEL: define i32 @nested_try()
+// CHECK-LABEL: define dso_local i32 @nested_try()
// X64-SAME: personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
// X86-SAME: personality i8* bitcast (i32 (...)* @_except_handler3 to i8*)
// CHECK: store i32 42, i32* %[[r:[^ ,]*]]
@@ -128,7 +128,7 @@ int nested_try(void) {
// CHECK: %[[cs_outer:[^ ]*]] = catchswitch within none [label %[[cpad_outer:[^ ]*]]] unwind to caller
//
// CHECK: [[cpad_outer]]
-// CHECK: catchpad within %{{[^ ]*}} [i8* bitcast (i32 ({{.*}})* @"\01?filt$0@0@nested_try@@" to i8*)]
+// CHECK: catchpad within %{{[^ ]*}} [i8* bitcast (i32 ({{.*}})* @"?filt$0@0@nested_try@@" to i8*)]
// CHECK-NEXT: catchret {{.*}} to label %[[except_outer:[^ ]*]]
//
// CHECK: [[except_outer]]
@@ -140,7 +140,7 @@ int nested_try(void) {
// CHECK: ret i32 %[[r_load]]
//
// CHECK: [[cpad_inner]]
-// CHECK: catchpad within %[[cs_inner]] [i8* bitcast (i32 ({{.*}})* @"\01?filt$1@0@nested_try@@" to i8*)]
+// CHECK: catchpad within %[[cs_inner]] [i8* bitcast (i32 ({{.*}})* @"?filt$1@0@nested_try@@" to i8*)]
// CHECK-NEXT: catchret {{.*}} to label %[[except_inner:[^ ]*]]
//
// CHECK: [[except_inner]]
@@ -154,13 +154,13 @@ int nested_try(void) {
// CHECK: store i32 0, i32* %[[r]]
// CHECK: br label %[[inner_try_cont]]
//
-// CHECK-LABEL: define internal i32 @"\01?filt$0@0@nested_try@@"({{.*}})
+// CHECK-LABEL: define internal i32 @"?filt$0@0@nested_try@@"({{.*}})
// X86: call i8* @llvm.x86.seh.recoverfp({{.*}})
// CHECK: load i32*, i32**
// CHECK: load i32, i32*
// CHECK: icmp eq i32 %{{.*}}, 456
//
-// CHECK-LABEL: define internal i32 @"\01?filt$1@0@nested_try@@"({{.*}})
+// CHECK-LABEL: define internal i32 @"?filt$1@0@nested_try@@"({{.*}})
// X86: call i8* @llvm.x86.seh.recoverfp({{.*}})
// CHECK: load i32*, i32**
// CHECK: load i32, i32*
@@ -174,7 +174,7 @@ int basic_finally(int g) {
}
return g;
}
-// CHECK-LABEL: define i32 @basic_finally(i32 %g)
+// CHECK-LABEL: define dso_local i32 @basic_finally(i32 %g)
// X64-SAME: personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
// X86-SAME: personality i8* bitcast (i32 (...)* @_except_handler3 to i8*)
// CHECK: %[[g_addr:[^ ]*]] = alloca i32, align 4
@@ -186,17 +186,17 @@ int basic_finally(int g) {
//
// CHECK: [[cont]]
// CHECK: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK: call void @"\01?fin$0@0@basic_finally@@"({{i8( zeroext)?}} 0, i8* %[[fp]])
+// CHECK: call void @"?fin$0@0@basic_finally@@"({{i8( zeroext)?}} 0, i8* %[[fp]])
// CHECK: load i32, i32* %[[g_addr]], align 4
// CHECK: ret i32
//
// CHECK: [[cleanuppad]]
// CHECK: %[[padtoken:[^ ]*]] = cleanuppad within none []
// CHECK: %[[fp:[^ ]*]] = call i8* @llvm.localaddress()
-// CHECK: call void @"\01?fin$0@0@basic_finally@@"({{i8( zeroext)?}} 1, i8* %[[fp]])
+// CHECK: call void @"?fin$0@0@basic_finally@@"({{i8( zeroext)?}} 1, i8* %[[fp]])
// CHECK: cleanupret from %[[padtoken]] unwind to caller
-// CHECK: define internal void @"\01?fin$0@0@basic_finally@@"({{i8( zeroext)?}} %abnormal_termination, i8* %frame_pointer)
+// CHECK: define internal void @"?fin$0@0@basic_finally@@"({{i8( zeroext)?}} %abnormal_termination, i8* %frame_pointer)
// CHECK: call i8* @llvm.localrecover(i8* bitcast (i32 (i32)* @basic_finally to i8*), i8* %frame_pointer, i32 0)
// CHECK: load i32, i32* %{{.*}}, align 4
// CHECK: add nsw i32 %{{.*}}, 1
@@ -211,7 +211,7 @@ int except_return(void) {
return 42;
}
}
-// CHECK-LABEL: define i32 @except_return()
+// CHECK-LABEL: define dso_local i32 @except_return()
// CHECK: %[[tmp:[^ ]*]] = invoke i32 @returns_int()
// CHECK: to label %[[cont:[^ ]*]] unwind label %[[catchpad:[^ ]*]]
//
@@ -240,7 +240,7 @@ void finally_capture_twice(int x) {
}
}
//
-// CHECK-LABEL: define void @finally_capture_twice(
+// CHECK-LABEL: define dso_local void @finally_capture_twice(
// CHECK: [[X:%.*]] = alloca i32, align 4
// CHECK: call void (...) @llvm.localescape(i32* [[X]])
// CHECK-NEXT: store i32 {{.*}}, i32* [[X]], align 4
@@ -267,7 +267,7 @@ int exception_code_in_except(void) {
}
}
-// CHECK-LABEL: define i32 @exception_code_in_except()
+// CHECK-LABEL: define dso_local i32 @exception_code_in_except()
// CHECK: %[[ret_slot:[^ ]*]] = alloca i32
// CHECK: %[[code_slot:[^ ]*]] = alloca i32
// CHECK: invoke void @try_body(i32 0, i32 0, i32* null)
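
The exceptions-seh.c updates track the same mangling change for the outlined SEH filter (`?filt$...`) and finally helpers. As a reminder of the kind of source construct being compiled, a minimal `__try`/`__except` with a filter expression might look like the sketch below; it assumes the Windows headers supply `GetExceptionCode()` (spelled `_exception_code()` in some environments), and the status value is the usual one from `<winnt.h>`:

    /* STATUS_INTEGER_DIVIDE_BY_ZERO; normally taken from <winnt.h>. */
    #define DIV_BY_ZERO 0xC0000094L

    int checked_div(int num, int den) {
      int res = -1;
      __try {
        res = num / den;
      } __except (GetExceptionCode() == DIV_BY_ZERO ? 1 : 0) {
        res = 0;   /* handler: the division faulted */
      }
      return res;
    }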
diff --git a/test/CodeGen/ext-vector.c b/test/CodeGen/ext-vector.c
index beb58827ea13..faa6ede6a24d 100644
--- a/test/CodeGen/ext-vector.c
+++ b/test/CodeGen/ext-vector.c
@@ -5,10 +5,10 @@ typedef __attribute__(( ext_vector_type(2) )) float float2;
typedef __attribute__(( ext_vector_type(4) )) int int4;
typedef __attribute__(( ext_vector_type(4) )) unsigned int uint4;
-// CHECK: @foo = global <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>
+// CHECK: @foo = {{(dso_local )?}}global <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>
float4 foo = (float4){ 1.0, 2.0, 3.0, 4.0 };
-// CHECK: @bar = constant <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 0x7FF0000000000000>
+// CHECK: @bar = {{(dso_local )?}}constant <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 0x7FF0000000000000>
const float4 bar = (float4){ 1.0, 2.0, 3.0, __builtin_inff() };
// CHECK: @test1
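
The ext-vector.c hunk only relaxes the globals' CHECK lines for `dso_local`; the underlying `ext_vector_type` feature is unchanged. For context, a small sketch of how such vectors are typically used (names are illustrative):

    typedef __attribute__((ext_vector_type(4))) float float4;

    /* Element-wise arithmetic and OpenCL-style swizzles work directly. */
    float4 scale_and_swap(float4 v, float s) {
      float4 r = v * s;   /* multiplies every lane by s */
      return r.wzyx;      /* reversed lane order via a swizzle */
    }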
diff --git a/test/CodeGen/f16c-builtins.c b/test/CodeGen/f16c-builtins.c
index e4933e9f9bf3..ce14187cdad3 100644
--- a/test/CodeGen/f16c-builtins.c
+++ b/test/CodeGen/f16c-builtins.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +f16c -emit-llvm -o - -Wall -Werror | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
float test_cvtsh_ss(unsigned short a) {
// CHECK-LABEL: test_cvtsh_ss
diff --git a/test/CodeGen/fentry.c b/test/CodeGen/fentry.c
index b9133184e4d4..43586c45633d 100644
--- a/test/CodeGen/fentry.c
+++ b/test/CodeGen/fentry.c
@@ -7,5 +7,12 @@ int foo(void) {
return 0;
}
-//CHECK: attributes #{{[0-9]+}} = { {{.*}}"fentry-call"="true"{{.*}} }
-//NOPG-NOT: attributes #{{[0-9]+}} = { {{.*}}"fentry-call"{{.*}} }
+int __attribute__((no_instrument_function)) no_instrument(void) {
+ return foo();
+}
+
+//CHECK: attributes #0 = { {{.*}}"fentry-call"="true"{{.*}} }
+//CHECK: attributes #1 = { {{.*}} }
+//CHECK-NOT: attributes #1 = { {{.*}}"fentry-call"="true"{{.*}} }
+//NOPG-NOT: attributes #0 = { {{.*}}"fentry-call"{{.*}} }
+//NOPG-NOT: attributes #1 = { {{.*}}"fentry-call"{{.*}} }
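
The fentry.c change adds a function marked `__attribute__((no_instrument_function))` and checks that only the instrumented function's attribute set carries `"fentry-call"="true"`. A sketch of the pattern, assuming the same `-pg -mfentry` style options as the RUN lines (function names here are placeholders):

    /* instrumented() receives an __fentry__ call; excluded() should not,
       thanks to the attribute. */
    int instrumented(void) { return 1; }

    __attribute__((no_instrument_function))
    int excluded(void) { return instrumented(); }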
diff --git a/test/CodeGen/fixup-depth-overflow.c b/test/CodeGen/fixup-depth-overflow.c
index af452d05b643..1ae2a41b1c26 100644
--- a/test/CodeGen/fixup-depth-overflow.c
+++ b/test/CodeGen/fixup-depth-overflow.c
@@ -22,5 +22,5 @@ L1:
return;
}
-// CHECK-LABEL: define void @f
+// CHECK-LABEL: define {{(dso_local )?}}void @f
// CHECK-NOT: cleanup
diff --git a/test/CodeGen/flip-dllimport.c b/test/CodeGen/flip-dllimport.c
new file mode 100644
index 000000000000..d20479d8f6e8
--- /dev/null
+++ b/test/CodeGen/flip-dllimport.c
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -fms-extensions -emit-llvm -o - %s | FileCheck %s
+
+__declspec(dllimport) void f();
+void g() { f(); } // use it
+
+// CHECK: define dso_local dllexport void @f
+void f() { }
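
The flip-dllimport.c test checks that when a symbol declared `__declspec(dllimport)` is later defined in the same translation unit, the import is dropped and the definition is emitted (as `dso_local dllexport` per the CHECK line). A minimal sketch of the pattern, assuming an x86_64-windows-msvc target with `-fms-extensions` as in the RUN line (names are placeholders):

    /* Declared as an import first... */
    __declspec(dllimport) int get_value(void);

    int caller(void) { return get_value(); }

    /* ...then defined locally: the dllimport declaration no longer applies
       to this definition. */
    int get_value(void) { return 42; }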
diff --git a/test/CodeGen/fma-builtins.c b/test/CodeGen/fma-builtins.c
index 6f792a78d84c..ff047f3d8374 100644
--- a/test/CodeGen/fma-builtins.c
+++ b/test/CodeGen/fma-builtins.c
@@ -5,81 +5,105 @@
__m128 test_mm_fmadd_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fmadd_ps
- // CHECK: @llvm.x86.fma.vfmadd.ps
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_fmadd_ps(a, b, c);
}
__m128d test_mm_fmadd_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fmadd_pd
- // CHECK: @llvm.x86.fma.vfmadd.pd
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_fmadd_pd(a, b, c);
}
__m128 test_mm_fmadd_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fmadd_ss
- // CHECK: @llvm.x86.fma.vfmadd.ss
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
+ // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
return _mm_fmadd_ss(a, b, c);
}
__m128d test_mm_fmadd_sd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fmadd_sd
- // CHECK: @llvm.x86.fma.vfmadd.sd
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
+ // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
return _mm_fmadd_sd(a, b, c);
}
__m128 test_mm_fmsub_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fmsub_ps
// CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.ps(<4 x float> %{{.+}}, <4 x float> %{{.+}}, <4 x float> [[NEG]])
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_fmsub_ps(a, b, c);
}
__m128d test_mm_fmsub_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fmsub_pd
// CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.pd(<2 x double> %{{.+}}, <2 x double> %{{.+}}, <2 x double> [[NEG]])
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_fmsub_pd(a, b, c);
}
__m128 test_mm_fmsub_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fmsub_ss
// CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.ss(<4 x float> %{{.+}}, <4 x float> %{{.+}}, <4 x float> [[NEG]])
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
+ // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
return _mm_fmsub_ss(a, b, c);
}
__m128d test_mm_fmsub_sd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fmsub_sd
// CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.sd(<2 x double> %{{.+}}, <2 x double> %{{.+}}, <2 x double> [[NEG]])
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
+ // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
return _mm_fmsub_sd(a, b, c);
}
__m128 test_mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fnmadd_ps
// CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.ps(<4 x float> [[NEG]], <4 x float> %{{.+}}, <4 x float> %{{.+}})
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_fnmadd_ps(a, b, c);
}
__m128d test_mm_fnmadd_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fnmadd_pd
// CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.pd(<2 x double> [[NEG]], <2 x double> %{{.+}}, <2 x double> %{{.+}})
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_fnmadd_pd(a, b, c);
}
__m128 test_mm_fnmadd_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fnmadd_ss
// CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.ss(<4 x float> %{{.+}}, <4 x float> [[NEG]], <4 x float> %{{.+}})
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
+ // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
return _mm_fnmadd_ss(a, b, c);
}
__m128d test_mm_fnmadd_sd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fnmadd_sd
// CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.sd(<2 x double> %{{.+}}, <2 x double> [[NEG]], <2 x double> %{{.+}})
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
+ // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
return _mm_fnmadd_sd(a, b, c);
}
@@ -87,7 +111,7 @@ __m128 test_mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fnmsub_ps
// CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
// CHECK: [[NEG2:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.ps(<4 x float> [[NEG]], <4 x float> %{{.+}}, <4 x float> [[NEG2]])
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_fnmsub_ps(a, b, c);
}
@@ -95,7 +119,7 @@ __m128d test_mm_fnmsub_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fnmsub_pd
// CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
// CHECK: [[NEG2:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.pd(<2 x double> [[NEG]], <2 x double> %{{.+}}, <2 x double> [[NEG2]])
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_fnmsub_pd(a, b, c);
}
@@ -103,7 +127,11 @@ __m128 test_mm_fnmsub_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fnmsub_ss
// CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
// CHECK: [[NEG2:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.ss(<4 x float> %{{.+}}, <4 x float> [[NEG]], <4 x float> [[NEG2]])
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
+ // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
return _mm_fnmsub_ss(a, b, c);
}
@@ -111,73 +139,87 @@ __m128d test_mm_fnmsub_sd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fnmsub_sd
// CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
// CHECK: [[NEG2:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.sd(<2 x double> %{{.+}}, <2 x double> [[NEG]], <2 x double> [[NEG2]])
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
+ // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
return _mm_fnmsub_sd(a, b, c);
}
__m128 test_mm_fmaddsub_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fmaddsub_ps
- // CHECK: @llvm.x86.fma.vfmaddsub.ps
+ // CHECK: [[ADD:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]
+ // CHECK: shufflevector <4 x float> [[SUB]], <4 x float> [[ADD]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
return _mm_fmaddsub_ps(a, b, c);
}
__m128d test_mm_fmaddsub_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fmaddsub_pd
- // CHECK: @llvm.x86.fma.vfmaddsub.pd
+ // CHECK: [[ADD:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]]
+ // CHECK: shufflevector <2 x double> [[SUB]], <2 x double> [[ADD]], <2 x i32> <i32 0, i32 3>
return _mm_fmaddsub_pd(a, b, c);
}
__m128 test_mm_fmsubadd_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fmsubadd_ps
// CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmaddsub.ps(<4 x float> %{{.+}}, <4 x float> %{{.+}}, <4 x float> [[NEG]])
+ // CHECK: [[SUB:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: shufflevector <4 x float> [[ADD]], <4 x float> [[SUB]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
return _mm_fmsubadd_ps(a, b, c);
}
__m128d test_mm_fmsubadd_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fmsubadd_pd
// CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmaddsub.pd(<2 x double> %{{.+}}, <2 x double> %{{.+}}, <2 x double> [[NEG]])
+ // CHECK: [[SUB:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: shufflevector <2 x double> [[ADD]], <2 x double> [[SUB]], <2 x i32> <i32 0, i32 3>
return _mm_fmsubadd_pd(a, b, c);
}
__m256 test_mm256_fmadd_ps(__m256 a, __m256 b, __m256 c) {
// CHECK-LABEL: test_mm256_fmadd_ps
- // CHECK: @llvm.x86.fma.vfmadd.ps.256
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_fmadd_ps(a, b, c);
}
__m256d test_mm256_fmadd_pd(__m256d a, __m256d b, __m256d c) {
// CHECK-LABEL: test_mm256_fmadd_pd
- // CHECK: @llvm.x86.fma.vfmadd.pd.256
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_fmadd_pd(a, b, c);
}
__m256 test_mm256_fmsub_ps(__m256 a, __m256 b, __m256 c) {
// CHECK-LABEL: test_mm256_fmsub_ps
// CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
- // CHECK: @llvm.x86.fma.vfmadd.ps.256(<8 x float> %{{.+}}, <8 x float> %{{.+}}, <8 x float> [[NEG]])
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_fmsub_ps(a, b, c);
}
__m256d test_mm256_fmsub_pd(__m256d a, __m256d b, __m256d c) {
// CHECK-LABEL: test_mm256_fmsub_pd
// CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.pd.256(<4 x double> %{{.+}}, <4 x double> %{{.+}}, <4 x double> [[NEG]])
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_fmsub_pd(a, b, c);
}
__m256 test_mm256_fnmadd_ps(__m256 a, __m256 b, __m256 c) {
// CHECK-LABEL: test_mm256_fnmadd_ps
// CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
- // CHECK: @llvm.x86.fma.vfmadd.ps.256(<8 x float> [[NEG]], <8 x float> %{{.+}}, <8 x float> %{{.+}})
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_fnmadd_ps(a, b, c);
}
__m256d test_mm256_fnmadd_pd(__m256d a, __m256d b, __m256d c) {
// CHECK-LABEL: test_mm256_fnmadd_pd
// CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.pd.256(<4 x double> [[NEG]], <4 x double> %{{.+}}, <4 x double> %{{.+}})
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_fnmadd_pd(a, b, c);
}
@@ -185,7 +227,7 @@ __m256 test_mm256_fnmsub_ps(__m256 a, __m256 b, __m256 c) {
// CHECK-LABEL: test_mm256_fnmsub_ps
// CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
// CHECK: [[NEG2:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
- // CHECK: @llvm.x86.fma.vfmadd.ps.256(<8 x float> [[NEG]], <8 x float> %{{.+}}, <8 x float> [[NEG2]])
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_fnmsub_ps(a, b, c);
}
@@ -193,32 +235,42 @@ __m256d test_mm256_fnmsub_pd(__m256d a, __m256d b, __m256d c) {
// CHECK-LABEL: test_mm256_fnmsub_pd
// CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
// CHECK: [[NEG2:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.pd.256(<4 x double> [[NEG]], <4 x double> %{{.+}}, <4 x double> [[NEG2]])
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_fnmsub_pd(a, b, c);
}
__m256 test_mm256_fmaddsub_ps(__m256 a, __m256 b, __m256 c) {
// CHECK-LABEL: test_mm256_fmaddsub_ps
- // CHECK: @llvm.x86.fma.vfmaddsub.ps.256
+ // CHECK: [[ADD:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> [[NEG]]
+ // CHECK: shufflevector <8 x float> [[SUB]], <8 x float> [[ADD]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
return _mm256_fmaddsub_ps(a, b, c);
}
__m256d test_mm256_fmaddsub_pd(__m256d a, __m256d b, __m256d c) {
// CHECK-LABEL: test_mm256_fmaddsub_pd
- // CHECK: @llvm.x86.fma.vfmaddsub.pd.256
+ // CHECK: [[ADD:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: shufflevector <4 x double> [[SUB]], <4 x double> [[ADD]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
return _mm256_fmaddsub_pd(a, b, c);
}
__m256 test_mm256_fmsubadd_ps(__m256 a, __m256 b, __m256 c) {
// CHECK-LABEL: test_mm256_fmsubadd_ps
// CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
- // CHECK: @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.+}}, <8 x float> [[NEG]])
+ // CHECK: [[SUB:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: shufflevector <8 x float> [[ADD]], <8 x float> [[SUB]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
return _mm256_fmsubadd_ps(a, b, c);
}
__m256d test_mm256_fmsubadd_pd(__m256d a, __m256d b, __m256d c) {
// CHECK-LABEL: test_mm256_fmsubadd_pd
// CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.+}}, <4 x double> %{{.+}}, <4 x double> [[NEG]])
+ // CHECK: [[SUB:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: shufflevector <4 x double> [[ADD]], <4 x double> [[SUB]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
return _mm256_fmsubadd_pd(a, b, c);
}
diff --git a/test/CodeGen/fma4-builtins.c b/test/CodeGen/fma4-builtins.c
index c848d4a751da..8078a6ed69b2 100644
--- a/test/CodeGen/fma4-builtins.c
+++ b/test/CodeGen/fma4-builtins.c
@@ -5,81 +5,105 @@
__m128 test_mm_macc_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_macc_ps
- // CHECK: @llvm.x86.fma.vfmadd.ps
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_macc_ps(a, b, c);
}
__m128d test_mm_macc_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_macc_pd
- // CHECK: @llvm.x86.fma.vfmadd.pd
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_macc_pd(a, b, c);
}
__m128 test_mm_macc_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_macc_ss
- // CHECK: @llvm.x86.fma4.vfmadd.ss
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
+ // CHECK: insertelement <4 x float> zeroinitializer, float %{{.*}}, i64 0
return _mm_macc_ss(a, b, c);
}
__m128d test_mm_macc_sd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_macc_sd
- // CHECK: @llvm.x86.fma4.vfmadd.sd
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
+ // CHECK: insertelement <2 x double> zeroinitializer, double %{{.*}}, i64 0
return _mm_macc_sd(a, b, c);
}
__m128 test_mm_msub_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_msub_ps
// CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.ps(<4 x float> %{{.+}}, <4 x float> %{{.+}}, <4 x float> [[NEG]])
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_msub_ps(a, b, c);
}
__m128d test_mm_msub_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_msub_pd
// CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.pd(<2 x double> %{{.+}}, <2 x double> %{{.+}}, <2 x double> [[NEG]])
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_msub_pd(a, b, c);
}
__m128 test_mm_msub_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_msub_ss
// CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma4.vfmadd.ss(<4 x float> %{{.+}}, <4 x float> %{{.+}}, <4 x float> [[NEG]])
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float [[C]])
+ // CHECK: insertelement <4 x float> zeroinitializer, float %{{.*}}, i64 0
return _mm_msub_ss(a, b, c);
}
__m128d test_mm_msub_sd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_msub_sd
// CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma4.vfmadd.sd(<2 x double> %{{.+}}, <2 x double> %{{.+}}, <2 x double> [[NEG]])
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double [[C]])
+ // CHECK: insertelement <2 x double> zeroinitializer, double %{{.*}}, i64 0
return _mm_msub_sd(a, b, c);
}
__m128 test_mm_nmacc_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_nmacc_ps
// CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.ps(<4 x float> [[NEG]], <4 x float> %{{.+}}, <4 x float> %{{.+}})
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_nmacc_ps(a, b, c);
}
__m128d test_mm_nmacc_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_nmacc_pd
// CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.pd(<2 x double> [[NEG]], <2 x double> %{{.+}}, <2 x double> %{{.+}})
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_nmacc_pd(a, b, c);
}
__m128 test_mm_nmacc_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_nmacc_ss
// CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma4.vfmadd.ss(<4 x float> [[NEG]], <4 x float> %{{.+}}, <4 x float> %{{.+}})
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: call float @llvm.fma.f32(float [[A]], float %{{.*}}, float %{{.*}})
+ // CHECK: insertelement <4 x float> zeroinitializer, float %{{.*}}, i64 0
return _mm_nmacc_ss(a, b, c);
}
__m128d test_mm_nmacc_sd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_nmacc_sd
// CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma4.vfmadd.sd(<2 x double> [[NEG]], <2 x double> %{{.+}}, <2 x double> %{{.+}})
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: call double @llvm.fma.f64(double [[A]], double %{{.*}}, double %{{.*}})
+ // CHECK: insertelement <2 x double> zeroinitializer, double %{{.*}}, i64 0
return _mm_nmacc_sd(a, b, c);
}
@@ -87,7 +111,7 @@ __m128 test_mm_nmsub_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_nmsub_ps
// CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
// CHECK: [[NEG2:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.ps(<4 x float> [[NEG]], <4 x float> %{{.+}}, <4 x float> [[NEG2]])
+ // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_nmsub_ps(a, b, c);
}
@@ -95,7 +119,7 @@ __m128d test_mm_nmsub_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_nmsub_pd
// CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
// CHECK: [[NEG2:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.pd(<2 x double> [[NEG]], <2 x double> %{{.+}}, <2 x double> [[NEG2]])
+ // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_nmsub_pd(a, b, c);
}
@@ -103,7 +127,11 @@ __m128 test_mm_nmsub_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_nmsub_ss
// CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
// CHECK: [[NEG2:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma4.vfmadd.ss(<4 x float> [[NEG]], <4 x float> %{{.+}}, <4 x float> [[NEG2]])
+ // CHECK: [[A:%.+]] = extractelement <4 x float> [[NEG]], i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
+ // CHECK: call float @llvm.fma.f32(float [[A]], float %{{.*}}, float [[C]])
+ // CHECK: insertelement <4 x float> zeroinitializer, float %{{.*}}, i64 0
return _mm_nmsub_ss(a, b, c);
}
@@ -111,73 +139,87 @@ __m128d test_mm_nmsub_sd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_nmsub_sd
// CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
// CHECK: [[NEG2:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma4.vfmadd.sd(<2 x double> [[NEG]], <2 x double> %{{.+}}, <2 x double> [[NEG2]])
+ // CHECK: [[A:%.+]] = extractelement <2 x double> [[NEG]], i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
+ // CHECK: call double @llvm.fma.f64(double [[A]], double %{{.*}}, double [[C]])
+ // CHECK: insertelement <2 x double> zeroinitializer, double %{{.*}}, i64 0
return _mm_nmsub_sd(a, b, c);
}
__m128 test_mm_maddsub_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_maddsub_ps
- // CHECK: @llvm.x86.fma.vfmaddsub.ps
+ // CHECK: [[ADD:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]
+ // CHECK: shufflevector <4 x float> [[SUB]], <4 x float> [[ADD]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
return _mm_maddsub_ps(a, b, c);
}
__m128d test_mm_maddsub_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_maddsub_pd
- // CHECK: @llvm.x86.fma.vfmaddsub.pd
+ // CHECK: [[ADD:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]]
+ // CHECK: shufflevector <2 x double> [[SUB]], <2 x double> [[ADD]], <2 x i32> <i32 0, i32 3>
return _mm_maddsub_pd(a, b, c);
}
__m128 test_mm_msubadd_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_msubadd_ps
// CHECK: [[NEG:%.+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmaddsub.ps(<4 x float> %{{.+}}, <4 x float> %{{.+}}, <4 x float> [[NEG]])
+ // CHECK: [[SUB:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+ // CHECK: shufflevector <4 x float> [[ADD]], <4 x float> [[SUB]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
return _mm_msubadd_ps(a, b, c);
}
__m128d test_mm_msubadd_pd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_msubadd_pd
// CHECK: [[NEG:%.+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmaddsub.pd(<2 x double> %{{.+}}, <2 x double> %{{.+}}, <2 x double> [[NEG]])
+ // CHECK: [[SUB:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+ // CHECK: shufflevector <2 x double> [[ADD]], <2 x double> [[SUB]], <2 x i32> <i32 0, i32 3>
return _mm_msubadd_pd(a, b, c);
}
__m256 test_mm256_macc_ps(__m256 a, __m256 b, __m256 c) {
// CHECK-LABEL: test_mm256_macc_ps
- // CHECK: @llvm.x86.fma.vfmadd.ps.256
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_macc_ps(a, b, c);
}
__m256d test_mm256_macc_pd(__m256d a, __m256d b, __m256d c) {
// CHECK-LABEL: test_mm256_macc_pd
- // CHECK: @llvm.x86.fma.vfmadd.pd.256
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_macc_pd(a, b, c);
}
__m256 test_mm256_msub_ps(__m256 a, __m256 b, __m256 c) {
// CHECK-LABEL: test_mm256_msub_ps
// CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
- // CHECK: @llvm.x86.fma.vfmadd.ps.256(<8 x float> %{{.+}}, <8 x float> %{{.+}}, <8 x float> [[NEG]])
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_msub_ps(a, b, c);
}
__m256d test_mm256_msub_pd(__m256d a, __m256d b, __m256d c) {
// CHECK-LABEL: test_mm256_msub_pd
// CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.pd.256(<4 x double> %{{.+}}, <4 x double> %{{.+}}, <4 x double> [[NEG]])
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_msub_pd(a, b, c);
}
__m256 test_mm256_nmacc_ps(__m256 a, __m256 b, __m256 c) {
// CHECK-LABEL: test_mm256_nmacc_ps
// CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
- // CHECK: @llvm.x86.fma.vfmadd.ps.256(<8 x float> [[NEG]], <8 x float> %{{.+}}, <8 x float> %{{.+}})
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_nmacc_ps(a, b, c);
}
__m256d test_mm256_nmacc_pd(__m256d a, __m256d b, __m256d c) {
// CHECK-LABEL: test_mm256_nmacc_pd
// CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.pd.256(<4 x double> [[NEG]], <4 x double> %{{.+}}, <4 x double> %{{.+}})
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_nmacc_pd(a, b, c);
}
@@ -185,7 +227,7 @@ __m256 test_mm256_nmsub_ps(__m256 a, __m256 b, __m256 c) {
// CHECK-LABEL: test_mm256_nmsub_ps
// CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
// CHECK: [[NEG2:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
- // CHECK: @llvm.x86.fma.vfmadd.ps.256(<8 x float> [[NEG]], <8 x float> %{{.+}}, <8 x float> [[NEG2]])
+ // CHECK: call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_nmsub_ps(a, b, c);
}
@@ -193,32 +235,42 @@ __m256d test_mm256_nmsub_pd(__m256d a, __m256d b, __m256d c) {
// CHECK-LABEL: test_mm256_nmsub_pd
// CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
// CHECK: [[NEG2:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmadd.pd.256(<4 x double> [[NEG]], <4 x double> %{{.+}}, <4 x double> [[NEG2]])
+ // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_nmsub_pd(a, b, c);
}
__m256 test_mm256_maddsub_ps(__m256 a, __m256 b, __m256 c) {
// CHECK-LABEL: test_mm256_maddsub_ps
- // CHECK: @llvm.x86.fma.vfmaddsub.ps.256
+ // CHECK: [[ADD:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+ // CHECK: [[SUB:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> [[NEG]]
+ // CHECK: shufflevector <8 x float> [[SUB]], <8 x float> [[ADD]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
return _mm256_maddsub_ps(a, b, c);
}
__m256d test_mm256_maddsub_pd(__m256d a, __m256d b, __m256d c) {
// CHECK-LABEL: test_mm256_maddsub_pd
- // CHECK: @llvm.x86.fma.vfmaddsub.pd.256
+ // CHECK: [[ADD:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
+ // CHECK: [[SUB:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: shufflevector <4 x double> [[SUB]], <4 x double> [[ADD]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
return _mm256_maddsub_pd(a, b, c);
}
__m256 test_mm256_msubadd_ps(__m256 a, __m256 b, __m256 c) {
// CHECK-LABEL: test_mm256_msubadd_ps
// CHECK: [[NEG:%.+]] = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
- // CHECK: @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.+}}, <8 x float> [[NEG]])
+ // CHECK: [[SUB:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+ // CHECK: shufflevector <8 x float> [[ADD]], <8 x float> [[SUB]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
return _mm256_msubadd_ps(a, b, c);
}
__m256d test_mm256_msubadd_pd(__m256d a, __m256d b, __m256d c) {
// CHECK-LABEL: test_mm256_msubadd_pd
// CHECK: [[NEG:%.+]] = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.+}}
- // CHECK: @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.+}}, <4 x double> %{{.+}}, <4 x double> [[NEG]])
+ // CHECK: [[SUB:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> [[NEG]]
+ // CHECK: [[ADD:%.+]] = call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+ // CHECK: shufflevector <4 x double> [[ADD]], <4 x double> [[SUB]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
return _mm256_msubadd_pd(a, b, c);
}
diff --git a/test/CodeGen/fsgsbase-builtins.c b/test/CodeGen/fsgsbase-builtins.c
index 25f4b311a596..587ad84e3f51 100644
--- a/test/CodeGen/fsgsbase-builtins.c
+++ b/test/CodeGen/fsgsbase-builtins.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +fsgsbase -emit-llvm -o - | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
unsigned int test_readfsbase_u32()
{
diff --git a/test/CodeGen/function-alignment.c b/test/CodeGen/function-alignment.c
new file mode 100644
index 000000000000..8564ceb02354
--- /dev/null
+++ b/test/CodeGen/function-alignment.c
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-NONE
+// RUN: %clang_cc1 -emit-llvm -function-alignment 4 %s -o - | FileCheck %s -check-prefix CHECK-16
+// RUN: %clang_cc1 -emit-llvm -function-alignment 5 %s -o - | FileCheck %s -check-prefix CHECK-32
+
+void f(void) {}
+void __attribute__((__aligned__(64))) g(void) {}
+
+// CHECK-NONE-NOT: define {{(dso_local )?}}void @f() #0 align
+// CHECK-NONE: define {{(dso_local )?}}void @g() #0 align 64
+
+// CHECK-16: define {{(dso_local )?}}void @f() #0 align 16
+// CHECK-16: define {{(dso_local )?}}void @g() #0 align 64
+
+// CHECK-32: define {{(dso_local )?}}void @f() #0 align 32
+// CHECK-32: define {{(dso_local )?}}void @g() #0 align 64
+
diff --git a/test/CodeGen/function-attributes.c b/test/CodeGen/function-attributes.c
index e14397440100..9174b8785d9e 100644
--- a/test/CodeGen/function-attributes.c
+++ b/test/CodeGen/function-attributes.c
@@ -71,7 +71,7 @@ void f15(void) {
// PR5254
// CHECK-LABEL: define void @f16
-// CHECK: [[ALIGN:#[0-9]+]]
+// CHECK: [[SR:#[0-9]+]]
// CHECK: {
void __attribute__((force_align_arg_pointer)) f16(void) {
}
@@ -112,7 +112,7 @@ void f20(void) {
// CHECK: attributes [[NUW]] = { nounwind optsize{{.*}} }
// CHECK: attributes [[AI]] = { alwaysinline nounwind optsize{{.*}} }
// CHECK: attributes [[NUW_OS_RN]] = { nounwind optsize readnone{{.*}} }
-// CHECK: attributes [[ALIGN]] = { nounwind optsize alignstack=16{{.*}} }
+// CHECK: attributes [[SR]] = { nounwind optsize{{.*}} "stackrealign"{{.*}} }
// CHECK: attributes [[RT]] = { nounwind optsize returns_twice{{.*}} }
// CHECK: attributes [[NR]] = { noreturn optsize }
// CHECK: attributes [[NUW_RN]] = { nounwind optsize readnone }
diff --git a/test/CodeGen/function-min-vector-width.c b/test/CodeGen/function-min-vector-width.c
new file mode 100644
index 000000000000..fb22a4fce6d9
--- /dev/null
+++ b/test/CodeGen/function-min-vector-width.c
@@ -0,0 +1,7 @@
+// This test verifies that we produce min-legal-vector-width attributes
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s
+
+void __attribute((__min_vector_width__(128))) foo() {}
+
+// CHECK: "min-legal-vector-width"="128"
diff --git a/test/CodeGen/gfni-builtins.c b/test/CodeGen/gfni-builtins.c
index 95cfd4fcd5c7..0d10d7aeacd6 100644
--- a/test/CodeGen/gfni-builtins.c
+++ b/test/CodeGen/gfni-builtins.c
@@ -1,6 +1,6 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +gfni -emit-llvm -o - | FileCheck %s --check-prefix SSE
-// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -DAVX -target-feature +gfni -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes SSE,AVX
-// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -DAVX512 -target-feature +gfni -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - | FileCheck %s --check-prefixes SSE,AVX,AVX512
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +gfni -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes SSE,AVX
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +gfni -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - | FileCheck %s --check-prefixes SSE,AVX,AVX512
#include <immintrin.h>
@@ -22,7 +22,7 @@ __m128i test_mm_gf2p8mul_epi8(__m128i A, __m128i B) {
return _mm_gf2p8mul_epi8(A, B);
}
-#if defined(AVX) || defined(AVX512)
+#ifdef __AVX__
__m256i test_mm256_gf2p8affineinv_epi64_epi8(__m256i A, __m256i B) {
// AVX-LABEL: @test_mm256_gf2p8affineinv_epi64_epi8
// AVX: @llvm.x86.vgf2p8affineinvqb.256
@@ -40,9 +40,9 @@ __m256i test_mm256_gf2p8mul_epi8(__m256i A, __m256i B) {
// AVX: @llvm.x86.vgf2p8mulb.256
return _mm256_gf2p8mul_epi8(A, B);
}
-#endif // AVX
+#endif // __AVX__
-#ifdef AVX512
+#ifdef __AVX512BW__
__m512i test_mm512_gf2p8affineinv_epi64_epi8(__m512i A, __m512i B) {
// AVX512-LABEL: @test_mm512_gf2p8affineinv_epi64_epi8
// AVX512: @llvm.x86.vgf2p8affineinvqb.512
@@ -179,4 +179,4 @@ __m128i test_mm_mask_gf2p8mul_epi8(__m128i S, __mmask16 U, __m128i A, __m128i B)
// AVX512: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i8> {{.*}}
return _mm_mask_gf2p8mul_epi8(S, U, A, B);
}
-#endif // AVX512
+#endif // __AVX512BW__
diff --git a/test/CodeGen/hexagon-brev-ld-ptr-incdec.c b/test/CodeGen/hexagon-brev-ld-ptr-incdec.c
new file mode 100644
index 000000000000..67c6c347e94d
--- /dev/null
+++ b/test/CodeGen/hexagon-brev-ld-ptr-incdec.c
@@ -0,0 +1,52 @@
+// REQUIRES: hexagon-registered-target
+// RUN: %clang_cc1 -emit-llvm -O2 -o - -triple hexagon-unknown-elf %s | FileCheck %s
+
+// The function should return the value in A[1].
+// Check that the HexagonBuiltinExpr doesn't evaluate &(*ptr++) twice. If it
+// did, the return value would be the value in A[2].
+// CHECK: @brev_ptr_inc
+// CHECK-DAG: llvm.hexagon.L2.loadri.pbr
+// CHECK-DAG: getelementptr{{.*}}i32 1
+// CHECK-NOT: getelementptr{{.*}}i32 2
+// CHECK-NOT: getelementptr{{.*}}i32 1
+int brev_ptr_inc(int A[], int B[]) {
+ int *p0 = &B[0];
+ int *p1 = &A[0];
+ __builtin_brev_ldw(p0, &*p1++, 8);
+ return (*p1);
+}
+
+// The function should return the value in A[0].
+// CHECK: @brev_ptr_dec
+// CHECK: llvm.hexagon.L2.loadri.pbr
+// CHECK: [[RET:%[0-9]+]] = load{{.*}}%A
+// CHECK: ret{{.*}}[[RET]]
+int brev_ptr_dec(int A[], int B[]) {
+ int *p0 = &B[0];
+ int *p1 = &A[1];
+ __builtin_brev_ldw(p0, &*p1--, 8);
+ return (*p1);
+}
+
+// The store in bitcode needs to be of a width corresponding to 16 bits.
+// CHECK: @brev_ptr_half
+// CHECK: llvm.hexagon.L2.loadrh.pbr
+// CHECK: store{{.*}}i16{{.*}}i16*
+short int brev_ptr_half(short int A[], short int B[]) {
+ short int *p0 = &B[0];
+ short int *p1 = &A[0];
+ __builtin_brev_ldh(p0, &*p1++, 8);
+ return (*p1);
+}
+
+// The store in bitcode needs to be of a width corresponding to 8 bits.
+// CHECK: @brev_ptr_byte
+// CHECK: llvm.hexagon.L2.loadrub.pbr
+// CHECK: store{{.*}}i8{{.*}}i8*
+unsigned char brev_ptr_byte(unsigned char A[], unsigned char B[]) {
+ unsigned char *p0 = &B[0];
+ unsigned char *p1 = &A[0];
+ __builtin_brev_ldub(p0, &*p1++, 8);
+ return (*p1);
+}
+
diff --git a/test/CodeGen/hexagon-brev-store-elm.c b/test/CodeGen/hexagon-brev-store-elm.c
new file mode 100644
index 000000000000..f593419e63e6
--- /dev/null
+++ b/test/CodeGen/hexagon-brev-store-elm.c
@@ -0,0 +1,46 @@
+// REQUIRES: hexagon-registered-target
+// RUN: %clang_cc1 -emit-llvm -O2 -o - -triple hexagon-unknown-elf %s | FileCheck %s
+// This unit test validates that the store to "dst" variable needs to be eliminated.
+
+// CHECK: @brev_store_elimination_test1
+// CHECK: llvm.hexagon.L2.loadri.pbr
+// CHECK-NOT: store
+
+int *brev_store_elimination_test1(int *ptr, int mod) {
+ int dst = 100;
+ return __builtin_brev_ldw(ptr, &dst, mod);
+}
+
+// CHECK: @brev_store_elimination_test2
+// CHECK: llvm.hexagon.L2.loadri.pbr
+// CHECK-NOT: store
+extern int add(int a);
+int brev_store_elimination_test2(int *ptr, int mod) {
+ int dst = 100;
+ __builtin_brev_ldw(ptr, &dst, mod);
+ return add(dst);
+}
+
+// CHECK: @brev_store_elimination_test3
+// CHECK: llvm.hexagon.L2.loadri.pbr
+// CHECK-NOT: store
+int brev_store_elimination_test3(int *ptr, int mod, int inc) {
+ int dst = 100;
+ for (int i = 0; i < inc; ++i) {
+ __builtin_brev_ldw(ptr, &dst, mod);
+ dst = add(dst);
+ }
+ return dst;
+}
+
+// brev_store_elimination_test4 validates that the stores are not deleted
+// when the value is later passed by reference.
+// CHECK: @brev_store_elimination_test4
+// CHECK: llvm.hexagon.L2.loadri.pbr
+// CHECK: store
+extern int sub(int *a);
+int brev_store_elimination_test4(int *ptr, int mod) {
+ int dst = 100;
+ __builtin_brev_ldw(ptr, &dst, mod);
+ return sub(&dst);
+}
diff --git a/test/CodeGen/hexagon-check-builtins.c b/test/CodeGen/hexagon-check-builtins.c
new file mode 100644
index 000000000000..a9294362f6ba
--- /dev/null
+++ b/test/CodeGen/hexagon-check-builtins.c
@@ -0,0 +1,30 @@
+// REQUIRES: hexagon-registered-target
+// RUN: %clang_cc1 -fsyntax-only -triple hexagon-unknown-elf -verify %s
+
+int foo(int x) {
+ // expected-error-re@+2 {{argument value {{.*}} is outside the valid range}}
+ // expected-error-re@+1 {{argument value {{.*}} is outside the valid range}}
+ return __builtin_HEXAGON_S4_extract(x, 33, -1) +
+ // expected-error-re@+1 {{argument value {{.*}} is outside the valid range}}
+ __builtin_HEXAGON_S4_extract(x, 3, 91) +
+ // expected-error-re@+2 {{argument value {{.*}} is outside the valid range}}
+ // expected-error-re@+1 {{argument value {{.*}} is outside the valid range}}
+ __builtin_HEXAGON_S4_extract(x, -1, 35) +
+ __builtin_HEXAGON_S4_extract(x, 0, 31) +
+ __builtin_HEXAGON_S4_extract(x, 31, 0);
+}
+
+int bar(void *p, void *q, int x) {
+ // expected-error@+1 {{argument should be a multiple of 4}}
+ return __builtin_HEXAGON_L2_loadri_pci(p, -1, x, q) +
+ // expected-error-re@+2 {{argument value {{.*}} is outside the valid range}}
+ // expected-error@+1 {{argument should be a multiple of 4}}
+ __builtin_HEXAGON_L2_loadri_pci(p, -99, x, q) +
+ // expected-error-re@+1 {{argument value {{.*}} is outside the valid range}}
+ __builtin_HEXAGON_L2_loadri_pci(p, -132, x, q) +
+ __builtin_HEXAGON_L2_loadri_pci(p, 28, x, q) +
+ // expected-error-re@+2 {{argument value {{.*}} is outside the valid range}}
+ // expected-error@+1 {{argument should be a multiple of 4}}
+ __builtin_HEXAGON_L2_loadri_pci(p, 29, x, q);
+}
+
diff --git a/test/CodeGen/init.c b/test/CodeGen/init.c
index 5d086723cc0e..71aba39b1244 100644
--- a/test/CodeGen/init.c
+++ b/test/CodeGen/init.c
@@ -8,8 +8,9 @@ unsigned v2[2][3] = {[0 ... 1][0 ... 1] = 2222, 3333};
// CHECK-DAG: %struct.M = type { [2 x %struct.I] }
// CHECK-DAG: %struct.I = type { [3 x i32] }
-// CHECK: [1 x %struct.M] [%struct.M { [2 x %struct.I] [%struct.I { [3 x i32] [i32 4, i32 4, i32 0] }, %struct.I { [3 x i32] [i32 4, i32 4, i32 5] }] }],
-// CHECK: [2 x [3 x i32]] {{[[][[]}}3 x i32] [i32 2222, i32 2222, i32 0], [3 x i32] [i32 2222, i32 2222, i32 3333]],
+// CHECK-DAG: [1 x %struct.M] [%struct.M { [2 x %struct.I] [%struct.I { [3 x i32] [i32 4, i32 4, i32 0] }, %struct.I { [3 x i32] [i32 4, i32 4, i32 5] }] }],
+// CHECK-DAG: [2 x [3 x i32]] {{[[][[]}}3 x i32] [i32 2222, i32 2222, i32 0], [3 x i32] [i32 2222, i32 2222, i32 3333]],
+// CHECK-DAG: [[INIT14:.*]] = private global [16 x i32] [i32 0, i32 0, i32 0, i32 0, i32 0, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 0, i32 0, i32 0, i32 0], align 4
void f1() {
// Scalars in braces.
@@ -33,8 +34,8 @@ void f3() {
}
// Constants
-// CHECK: @g3 = constant i32 10
-// CHECK: @f4.g4 = internal constant i32 12
+// CHECK-DAG: @g3 = constant i32 10
+// CHECK-DAG: @f4.g4 = internal constant i32 12
const int g3 = 10;
int f4() {
static const int g4 = 12;
@@ -61,7 +62,7 @@ void f6() {
-// CHECK: @test7 = global{{.*}}{ i32 0, [4 x i8] c"bar\00" }
+// CHECK-DAG: @test7 = global{{.*}}{ i32 0, [4 x i8] c"bar\00" }
// PR8217
struct a7 {
int b;
@@ -71,29 +72,54 @@ struct a7 {
struct a7 test7 = { .b = 0, .v = "bar" };
+// CHECK-DAG: @huge_array = global {{.*}} <{ i32 1, i32 0, i32 2, i32 0, i32 3, [999999995 x i32] zeroinitializer }>
+int huge_array[1000000000] = {1, 0, 2, 0, 3, 0, 0, 0};
+
+// CHECK-DAG: @huge_struct = global {{.*}} { i32 1, <{ i32, [999999999 x i32] }> <{ i32 2, [999999999 x i32] zeroinitializer }> }
+struct Huge {
+ int a;
+ int arr[1000 * 1000 * 1000];
+} huge_struct = {1, {2, 0, 0, 0}};
+
+// CHECK-DAG: @large_array_with_zeroes = constant <{ [21 x i8], [979 x i8] }> <{ [21 x i8] c"abc\01\02\03xyzzy\00\00\00\00\00\00\00\00\00q", [979 x i8] zeroinitializer }>
+const char large_array_with_zeroes[1000] = {
+ 'a', 'b', 'c', 1, 2, 3, 'x', 'y', 'z', 'z', 'y', [20] = 'q'
+};
+
+char global;
+
+// CHECK-DAG: @large_array_with_zeroes_2 = global <{ [10 x i8*], [90 x i8*] }> <{ [10 x i8*] [i8* null, i8* null, i8* null, i8* null, i8* null, i8* null, i8* null, i8* null, i8* null, i8* @global], [90 x i8*] zeroinitializer }>
+const void *large_array_with_zeroes_2[100] = {
+ [9] = &global
+};
+// CHECK-DAG: @large_array_with_zeroes_3 = global <{ [10 x i8*], [990 x i8*] }> <{ [10 x i8*] [i8* null, i8* null, i8* null, i8* null, i8* null, i8* null, i8* null, i8* null, i8* null, i8* @global], [990 x i8*] zeroinitializer }>
+const void *large_array_with_zeroes_3[1000] = {
+ [9] = &global
+};
+
// PR279 comment #3
char test8(int X) {
char str[100000] = "abc"; // tail should be memset.
return str[X];
-// CHECK: @test8(
-// CHECK: call void @llvm.memset
-// CHECK: store i8 97
-// CHECK: store i8 98
-// CHECK: store i8 99
-// CHECK-NOT: getelementptr
-// CHECK: load
+ // CHECK-LABEL: @test8(
+ // CHECK: call void @llvm.memset
+ // CHECK: store i8 97, i8* %{{[0-9]*}}, align 1
+ // CHECK: store i8 98, i8* %{{[0-9]*}}, align 1
+ // CHECK: store i8 99, i8* %{{[0-9]*}}, align 1
+ // CHECK-NOT: getelementptr
+ // CHECK: load
}
void bar(void*);
// PR279
-int test9(int X) {
+void test9(int X) {
int Arr[100] = { X }; // Should use memset
bar(Arr);
-// CHECK: @test9
-// CHECK: call void @llvm.memset
-// CHECK-NOT: store i32 0
-// CHECK: call void @bar
+ // CHECK-LABEL: @test9(
+ // CHECK: call void @llvm.memset
+ // CHECK-NOT: store i32 0
+ // CHECK: call void @bar
}
struct a {
@@ -104,16 +130,82 @@ struct b {
struct a a,b,c,d,e,f,g;
};
-int test10(int X) {
+void test10(int X) {
struct b S = { .a.a = X, .d.e = X, .f.e = 0, .f.f = 0, .f.p = 0 };
bar(&S);
- // CHECK: @test10
+ // CHECK-LABEL: @test10(
// CHECK: call void @llvm.memset
// CHECK-NOT: store i32 0
// CHECK: call void @bar
}
+void nonzeroMemseti8() {
+ char arr[33] = { 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, };
+ // CHECK-LABEL: @nonzeroMemseti8(
+ // CHECK-NOT: store
+ // CHECK-NOT: memcpy
+ // CHECK: call void @llvm.memset.p0i8.i32(i8* {{.*}}, i8 42, i32 33, i1 false)
+}
+
+void nonzeroMemseti16() {
+ unsigned short arr[17] = { 0x4242, 0x4242, 0x4242, 0x4242, 0x4242, 0x4242, 0x4242, 0x4242, 0x4242, 0x4242, 0x4242, 0x4242, 0x4242, 0x4242, 0x4242, 0x4242, 0x4242, };
+ // CHECK-LABEL: @nonzeroMemseti16(
+ // CHECK-NOT: store
+ // CHECK-NOT: memcpy
+ // CHECK: call void @llvm.memset.p0i8.i32(i8* {{.*}}, i8 66, i32 34, i1 false)
+}
+
+void nonzeroMemseti32() {
+ unsigned arr[9] = { 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, };
+ // CHECK-LABEL: @nonzeroMemseti32(
+ // CHECK-NOT: store
+ // CHECK-NOT: memcpy
+ // CHECK: call void @llvm.memset.p0i8.i32(i8* {{.*}}, i8 -16, i32 36, i1 false)
+}
+
+void nonzeroMemseti64() {
+ unsigned long long arr[7] = { 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, };
+ // CHECK-LABEL: @nonzeroMemseti64(
+ // CHECK-NOT: store
+ // CHECK-NOT: memcpy
+ // CHECK: call void @llvm.memset.p0i8.i32(i8* {{.*}}, i8 -86, i32 56, i1 false)
+}
+
+void nonzeroMemsetf32() {
+ float arr[9] = { 0x1.cacacap+75, 0x1.cacacap+75, 0x1.cacacap+75, 0x1.cacacap+75, 0x1.cacacap+75, 0x1.cacacap+75, 0x1.cacacap+75, 0x1.cacacap+75, 0x1.cacacap+75, };
+ // CHECK-LABEL: @nonzeroMemsetf32(
+ // CHECK-NOT: store
+ // CHECK-NOT: memcpy
+ // CHECK: call void @llvm.memset.p0i8.i32(i8* {{.*}}, i8 101, i32 36, i1 false)
+}
+
+void nonzeroMemsetf64() {
+ double arr[7] = { 0x1.4444444444444p+69, 0x1.4444444444444p+69, 0x1.4444444444444p+69, 0x1.4444444444444p+69, 0x1.4444444444444p+69, 0x1.4444444444444p+69, 0x1.4444444444444p+69, };
+ // CHECK-LABEL: @nonzeroMemsetf64(
+ // CHECK-NOT: store
+ // CHECK-NOT: memcpy
+ // CHECK: call void @llvm.memset.p0i8.i32(i8* {{.*}}, i8 68, i32 56, i1 false)
+}
+
+void nonzeroPaddedUnionMemset() {
+ union U { char c; int i; };
+ union U arr[9] = { 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, };
+ // CHECK-LABEL: @nonzeroPaddedUnionMemset(
+ // CHECK-NOT: store
+ // CHECK-NOT: memcpy
+ // CHECK: call void @llvm.memset.p0i8.i32(i8* {{.*}}, i8 -16, i32 36, i1 false)
+}
+
+void nonzeroNestedMemset() {
+ union U { char c; int i; };
+ struct S { union U u; short i; };
+ struct S arr[5] = { { {0xF0}, 0xF0F0 }, { {0xF0}, 0xF0F0 }, { {0xF0}, 0xF0F0 }, { {0xF0}, 0xF0F0 }, { {0xF0}, 0xF0F0 }, };
+ // CHECK-LABEL: @nonzeroNestedMemset(
+ // CHECK-NOT: store
+ // CHECK-NOT: memcpy
+ // CHECK: call void @llvm.memset.p0i8.i32(i8* {{.*}}, i8 -16, i32 40, i1 false)
+}
// PR9257
struct test11S {
@@ -121,11 +213,11 @@ struct test11S {
};
void test11(struct test11S *P) {
*P = (struct test11S) { .A = { [0 ... 3] = 4 } };
- // CHECK: @test11
- // CHECK: store i32 4
- // CHECK: store i32 4
- // CHECK: store i32 4
- // CHECK: store i32 4
+ // CHECK-LABEL: @test11(
+ // CHECK: store i32 4, i32* %{{.*}}, align 4
+ // CHECK: store i32 4, i32* %{{.*}}, align 4
+ // CHECK: store i32 4, i32* %{{.*}}, align 4
+ // CHECK: store i32 4, i32* %{{.*}}, align 4
// CHECK: ret void
}
@@ -140,14 +232,26 @@ struct test12 {
void test13(int x) {
struct X { int a; int b : 10; int c; };
struct X y = {.c = x};
- // CHECK: @test13
+ // CHECK-LABEL: @test13(
// CHECK: and i16 {{.*}}, -1024
}
-// CHECK-LABEL: @PR20473
+// CHECK-LABEL: @PR20473(
void PR20473() {
// CHECK: memcpy{{.*}}getelementptr inbounds ([2 x i8], [2 x i8]* @
bar((char[2]) {""});
// CHECK: memcpy{{.*}}getelementptr inbounds ([3 x i8], [3 x i8]* @
bar((char[3]) {""});
}
+
+// Test that we initialize large member arrays by copying from a global and not
+// with a series of stores.
+struct S14 { int a[16]; };
+
+void test14(struct S14 *s14) {
+ // CHECK-LABEL: @test14(
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 {{.*}}, i8* align 4 {{.*}} [[INIT14]] {{.*}}, i32 64, i1 false)
+ // CHECK-NOT: store
+ // CHECK: ret void
+ *s14 = (struct S14) { { [5 ... 11] = 17 } };
+}
diff --git a/test/CodeGen/inline.c b/test/CodeGen/inline.c
index e17b251df000..ea6e9d76494b 100644
--- a/test/CodeGen/inline.c
+++ b/test/CodeGen/inline.c
@@ -61,13 +61,13 @@
// RUN: %clang_cc1 %s -triple i386-pc-win32 -O1 -disable-llvm-passes -emit-llvm -o - -std=c99 | FileCheck %s --check-prefix=CHECK4
// RUN: %clang_cc1 %s -triple i386-pc-win32 -fexperimental-new-pass-manager -O1 -disable-llvm-passes -emit-llvm -o - -std=c99 | FileCheck %s --check-prefix=CHECK4
// CHECK4-NOT: define weak_odr void @_Exit(
-// CHECK4-LABEL: define weak_odr i32 @ei()
-// CHECK4-LABEL: define i32 @bar()
+// CHECK4-LABEL: define weak_odr dso_local i32 @ei()
+// CHECK4-LABEL: define dso_local i32 @bar()
// CHECK4-NOT: unreferenced1
-// CHECK4-LABEL: define weak_odr void @unreferenced2()
-// CHECK4-LABEL: define void @gnu_inline()
-// CHECK4-LABEL: define linkonce_odr i32 @foo()
-// CHECK4-LABEL: define available_externally void @gnu_ei_inline()
+// CHECK4-LABEL: define weak_odr dso_local void @unreferenced2()
+// CHECK4-LABEL: define dso_local void @gnu_inline()
+// CHECK4-LABEL: define linkonce_odr dso_local i32 @foo()
+// CHECK4-LABEL: define available_externally dso_local void @gnu_ei_inline()
__attribute__((noreturn)) void __cdecl _exit(int _Code);
__inline void __cdecl _Exit(int status) { _exit(status); }
diff --git a/test/CodeGen/invpcid.c b/test/CodeGen/invpcid.c
new file mode 100644
index 000000000000..f472cd995371
--- /dev/null
+++ b/test/CodeGen/invpcid.c
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +invpcid -emit-llvm -o - -Wall -Werror -pedantic | FileCheck %s
+// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +invpcid -emit-llvm -o - -Wall -Werror -pedantic | FileCheck %s
+
+#include <immintrin.h>
+
+#include <stdint.h>
+
+void test_invpcid(uint32_t type, void *descriptor) {
+ //CHECK-LABEL: @test_invpcid
+ //CHECK: call void @llvm.x86.invpcid(i32 %{{.*}}, i8* %{{.*}})
+ _invpcid(type, descriptor);
+}
diff --git a/test/CodeGen/kr-func-promote.c b/test/CodeGen/kr-func-promote.c
index 8e55dc91edf4..122e6435d0b5 100644
--- a/test/CodeGen/kr-func-promote.c
+++ b/test/CodeGen/kr-func-promote.c
@@ -1,6 +1,12 @@
// RUN: %clang_cc1 -triple i386-unknown-unknown %s -emit-llvm -o - | FileCheck %s
-// CHECK: i32 @a(i32)
+// CHECK: i32 @a(i32
int a();
int a(x) short x; {return x;}
+// CHECK: void @b(double
+// CHECK: %[[ADDR:.*]] = alloca float, align 4
+// CHECK: %[[TRUNC:.*]] = fptrunc double %0 to float
+// CHECK: store float %[[TRUNC]], float* %[[ADDR]], align 4
+void b();
+void b(f) float f; {}
diff --git a/test/CodeGen/le32-vaarg.c b/test/CodeGen/le32-vaarg.c
index c02af27691f2..7e1dd8a27ae6 100644
--- a/test/CodeGen/le32-vaarg.c
+++ b/test/CodeGen/le32-vaarg.c
@@ -23,7 +23,7 @@ void get_struct(va_list *args) {
// CHECK: [[RESULT:%[a-z_0-9]+]] = va_arg {{.*}}, %struct.Foo{{$}}
// CHECK: store %struct.Foo [[RESULT]], %struct.Foo* [[LOC:%[a-z_0-9]+]]
// CHECK: [[LOC2:%[a-z_0-9]+]] = bitcast {{.*}} [[LOC]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}@dest{{.*}}, i8* [[LOC2]]
+// CHECK: call void @llvm.memcpy{{.*}}@dest{{.*}}, i8* align {{[0-9]+}} [[LOC2]]
void skip_struct(va_list *args) {
va_arg(*args, struct Foo);
diff --git a/test/CodeGen/libcalls-fno-builtin.c b/test/CodeGen/libcalls-fno-builtin.c
index bc6a4302b01c..6fac6ee3390b 100644
--- a/test/CodeGen/libcalls-fno-builtin.c
+++ b/test/CodeGen/libcalls-fno-builtin.c
@@ -1,14 +1,19 @@
-// RUN: %clang_cc1 -S -O3 -fno-builtin -o - %s | FileCheck %s
-// RUN: %clang_cc1 -S -O3 -fno-builtin-ceil -fno-builtin-copysign -fno-builtin-cos \
+// RUN: %clang_cc1 -S -emit-llvm -fno-builtin -o - %s | FileCheck %s
+// RUN: %clang_cc1 -S -emit-llvm -fno-builtin-ceil -fno-builtin-copysign -fno-builtin-cos \
// RUN: -fno-builtin-fabs -fno-builtin-floor -fno-builtin-strcat -fno-builtin-strncat \
// RUN: -fno-builtin-strchr -fno-builtin-strrchr -fno-builtin-strcmp -fno-builtin-strncmp \
// RUN: -fno-builtin-strcpy -fno-builtin-stpcpy -fno-builtin-strncpy -fno-builtin-strlen \
// RUN: -fno-builtin-strpbrk -fno-builtin-strspn -fno-builtin-strtod -fno-builtin-strtof \
// RUN: -fno-builtin-strtold -fno-builtin-strtol -fno-builtin-strtoll -fno-builtin-strtoul \
-// RUN: -fno-builtin-strtoull -o - %s | FileCheck %s
+// RUN: -fno-builtin-strtoull -fno-builtin-fread -fno-builtin-fwrite -fno-builtin-fopen \
+// RUN: -o - %s | FileCheck %s
+// RUN: %clang_cc1 -S -O3 -fno-builtin -o - %s | FileCheck --check-prefix=ASM %s
+// RUN: %clang_cc1 -S -O3 -fno-builtin-ceil -o - %s | FileCheck --check-prefix=ASM-INDIV %s
+
// rdar://10551066
typedef __SIZE_TYPE__ size_t;
+typedef struct FILE FILE;
double ceil(double x);
double copysign(double,double);
@@ -34,99 +39,125 @@ long int strtol(const char *nptr, char **endptr, int base);
long long int strtoll(const char *nptr, char **endptr, int base);
unsigned long int strtoul(const char *nptr, char **endptr, int base);
unsigned long long int strtoull(const char *nptr, char **endptr, int base);
+size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
+size_t fwrite(const void *ptr, size_t size, size_t nmemb,
+ FILE *stream);
+FILE *fopen(const char *path, const char *mode);
double t1(double x) { return ceil(x); }
-// CHECK: t1
-// CHECK: ceil
+// CHECK-LABEL: t1
+// CHECK: call{{.*}}@ceil{{.*}} [[ATTR:#[0-9]+]]
+
+// ASM: t1
+// ASM: ceil
+
+// ASM-INDIV: t1
+// ASM-INDIV: ceil
double t2(double x, double y) { return copysign(x,y); }
-// CHECK: t2
-// CHECK: copysign
+// CHECK-LABEL: t2
+// CHECK: call{{.*}}@copysign{{.*}} [[ATTR]]
double t3(double x) { return cos(x); }
-// CHECK: t3
-// CHECK: cos
+// CHECK-LABEL: t3
+// CHECK: call{{.*}}@cos{{.*}} [[ATTR]]
double t4(double x) { return fabs(x); }
-// CHECK: t4
-// CHECK: fabs
+// CHECK-LABEL: t4
+// CHECK: call{{.*}}@fabs{{.*}} [[ATTR]]
double t5(double x) { return floor(x); }
-// CHECK: t5
-// CHECK: floor
+// CHECK-LABEL: t5
+// CHECK: call{{.*}}@floor{{.*}} [[ATTR]]
char *t6(char *x) { return strcat(x, ""); }
-// CHECK: t6
-// CHECK: strcat
+// CHECK-LABEL: t6
+// CHECK: call{{.*}}@strcat{{.*}} [[ATTR]]
char *t7(char *x) { return strncat(x, "", 1); }
-// CHECK: t7
-// CHECK: strncat
+// CHECK-LABEL: t7
+// CHECK: call{{.*}}@strncat{{.*}} [[ATTR]]
char *t8(void) { return strchr("hello, world", 'w'); }
-// CHECK: t8
-// CHECK: strchr
+// CHECK-LABEL: t8
+// CHECK: call{{.*}}@strchr{{.*}} [[ATTR]]
char *t9(void) { return strrchr("hello, world", 'w'); }
-// CHECK: t9
-// CHECK: strrchr
+// CHECK-LABEL: t9
+// CHECK: call{{.*}}@strrchr{{.*}} [[ATTR]]
int t10(void) { return strcmp("foo", "bar"); }
-// CHECK: t10
-// CHECK: strcmp
+// CHECK-LABEL: t10
+// CHECK: call{{.*}}@strcmp{{.*}} [[ATTR]]
int t11(void) { return strncmp("foo", "bar", 3); }
-// CHECK: t11
-// CHECK: strncmp
+// CHECK-LABEL: t11
+// CHECK: call{{.*}}@strncmp{{.*}} [[ATTR]]
char *t12(char *x) { return strcpy(x, "foo"); }
-// CHECK: t12
-// CHECK: strcpy
+// CHECK-LABEL: t12
+// CHECK: call{{.*}}@strcpy{{.*}} [[ATTR]]
char *t13(char *x) { return stpcpy(x, "foo"); }
-// CHECK: t13
-// CHECK: stpcpy
+// CHECK-LABEL: t13
+// CHECK: call{{.*}}@stpcpy{{.*}} [[ATTR]]
char *t14(char *x) { return strncpy(x, "foo", 3); }
-// CHECK: t14
-// CHECK: strncpy
+// CHECK-LABEL: t14
+// CHECK: call{{.*}}@strncpy{{.*}} [[ATTR]]
size_t t15(void) { return strlen("foo"); }
-// CHECK: t15
-// CHECK: strlen
+// CHECK-LABEL: t15
+// CHECK: call{{.*}}@strlen{{.*}} [[ATTR]]
char *t16(char *x) { return strpbrk(x, ""); }
-// CHECK: t16
-// CHECK: strpbrk
+// CHECK-LABEL: t16
+// CHECK: call{{.*}}@strpbrk{{.*}} [[ATTR]]
size_t t17(char *x) { return strspn(x, ""); }
-// CHECK: t17
-// CHECK: strspn
+// CHECK-LABEL: t17
+// CHECK: call{{.*}}@strspn{{.*}} [[ATTR]]
double t18(char **x) { return strtod("123.4", x); }
-// CHECK: t18
-// CHECK: strtod
+// CHECK-LABEL: t18
+// CHECK: call{{.*}}@strtod{{.*}} [[ATTR]]
float t19(char **x) { return strtof("123.4", x); }
-// CHECK: t19
-// CHECK: strtof
+// CHECK-LABEL: t19
+// CHECK: call{{.*}}@strtof{{.*}} [[ATTR]]
long double t20(char **x) { return strtold("123.4", x); }
-// CHECK: t20
-// CHECK: strtold
+// CHECK-LABEL: t20
+// CHECK: call{{.*}}@strtold{{.*}} [[ATTR]]
long int t21(char **x) { return strtol("1234", x, 10); }
-// CHECK: t21
-// CHECK: strtol
+// CHECK-LABEL: t21
+// CHECK: call{{.*}}@strtol{{.*}} [[ATTR]]
long int t22(char **x) { return strtoll("1234", x, 10); }
-// CHECK: t22
-// CHECK: strtoll
+// CHECK-LABEL: t22
+// CHECK: call{{.*}}@strtoll{{.*}} [[ATTR]]
long int t23(char **x) { return strtoul("1234", x, 10); }
-// CHECK: t23
-// CHECK: strtoul
+// CHECK-LABEL: t23
+// CHECK: call{{.*}}@strtoul{{.*}} [[ATTR]]
long int t24(char **x) { return strtoull("1234", x, 10); }
-// CHECK: t24
-// CHECK: strtoull
+// CHECK-LABEL: t24
+// CHECK: call{{.*}}@strtoull{{.*}} [[ATTR]]
+
+void t25(FILE *fp, int *buf) {
+ size_t x = fwrite(buf, sizeof(int), 10, fp);
+ size_t y = fread(buf, sizeof(int), 10, fp);
+}
+// CHECK-LABEL: t25
+// CHECK: call{{.*}}@fwrite{{.*}} [[ATTR]]
+// CHECK: call{{.*}}@fread{{.*}} [[ATTR]]
+
+FILE *t26(const char *path, const char *mode) {
+ return fopen(path, mode);
+}
+// CHECK-LABEL: t26
+// CHECK: call{{.*}}@fopen{{.*}} [[ATTR]]
+
+// CHECK: [[ATTR]] = { nobuiltin }
diff --git a/test/CodeGen/lto-newpm-pipeline.c b/test/CodeGen/lto-newpm-pipeline.c
index 10112da6d508..137c46c964cc 100644
--- a/test/CodeGen/lto-newpm-pipeline.c
+++ b/test/CodeGen/lto-newpm-pipeline.c
@@ -27,6 +27,7 @@
// CHECK-FULL-O0: Starting llvm::Module pass manager run.
// CHECK-FULL-O0: Running pass: AlwaysInlinerPass
+// CHECK-FULL-O0-NEXT: Running pass: NameAnonGlobalPass
// CHECK-FULL-O0-NEXT: Running pass: BitcodeWriterPass
// CHECK-FULL-O0: Finished llvm::Module pass manager run.
diff --git a/test/CodeGen/lzcnt-builtins.c b/test/CodeGen/lzcnt-builtins.c
index cc5d458c7630..b1414a3b56f2 100644
--- a/test/CodeGen/lzcnt-builtins.c
+++ b/test/CodeGen/lzcnt-builtins.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +lzcnt -emit-llvm -o - | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
unsigned short test__lzcnt16(unsigned short __X)
{
diff --git a/test/CodeGen/mangle-ms-string-literals.c b/test/CodeGen/mangle-ms-string-literals.c
new file mode 100644
index 000000000000..41dd78c1fa72
--- /dev/null
+++ b/test/CodeGen/mangle-ms-string-literals.c
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -x c -emit-llvm %s -o - -triple=i386-pc-win32 | FileCheck %s
+// RUN: %clang_cc1 -x c -emit-llvm %s -o - -triple=x86_64-pc-win32 | FileCheck %s
+
+void crbug857442(int x) {
+ // Make sure to handle truncated or padded literals. The truncation is only valid in C.
+ struct {int x; char s[2]; } truncatedAscii = {x, "hello"};
+ // CHECK: "??_C@_01CONKJJHI@he@"
+ struct {int x; char s[16]; } paddedAscii = {x, "hello"};
+ // CHECK: "??_C@_0BA@EAAINDNC@hello?$AA?$AA?$AA?$AA?$AA?$AA?$AA?$AA?$AA?$AA?$AA@"
+}
diff --git a/test/CodeGen/mangle-ms.c b/test/CodeGen/mangle-ms.c
index 042c72e6d794..13d23c6e6902 100644
--- a/test/CodeGen/mangle-ms.c
+++ b/test/CodeGen/mangle-ms.c
@@ -1,13 +1,13 @@
// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 | FileCheck %s
-// CHECK: define void @"\01?f@@$$J0YAXP6AX@Z@Z"
+// CHECK: define dso_local void @"?f@@$$J0YAXP6AX@Z@Z"
__attribute__((overloadable)) void f(void (*x)()) {}
-// CHECK: define void @f
+// CHECK: define dso_local void @f
void f(void (*x)(int)) {}
-// CHECK: define void @g
+// CHECK: define dso_local void @g
void g(void (*x)(int)) {}
-// CHECK: define void @"\01?g@@$$J0YAXP6AX@Z@Z"
+// CHECK: define dso_local void @"?g@@$$J0YAXP6AX@Z@Z"
__attribute__((overloadable)) void g(void (*x)()) {}
diff --git a/test/CodeGen/mangle-windows-rtd.c b/test/CodeGen/mangle-windows-rtd.c
index fc6f309eaf58..a22cba2e7ef9 100644
--- a/test/CodeGen/mangle-windows-rtd.c
+++ b/test/CodeGen/mangle-windows-rtd.c
@@ -1,10 +1,10 @@
// RUN: %clang_cc1 -emit-llvm -mrtd %s -o - -triple=i386-mingw32 | FileCheck %s
void f1(void) {}
-// CHECK: define x86_stdcallcc void @"\01_f1@0"
+// CHECK: define dso_local x86_stdcallcc void @"\01_f1@0"
void __stdcall f2(void) {}
-// CHECK: define x86_stdcallcc void @"\01_f2@0"
+// CHECK: define dso_local x86_stdcallcc void @"\01_f2@0"
void __fastcall f3(void) {}
-// CHECK: define x86_fastcallcc void @"\01@f3@0"
+// CHECK: define dso_local x86_fastcallcc void @"\01@f3@0"
diff --git a/test/CodeGen/mangle-windows.c b/test/CodeGen/mangle-windows.c
index db39425da827..6dd14d95e00f 100644
--- a/test/CodeGen/mangle-windows.c
+++ b/test/CodeGen/mangle-windows.c
@@ -11,73 +11,73 @@
// ELF64: target datalayout = "e-m:e-{{.*}}"
void __stdcall f1(void) {}
-// CHECK: define x86_stdcallcc void @"\01_f1@0"
-// X64: define void @f1(
+// CHECK: define dso_local x86_stdcallcc void @"\01_f1@0"
+// X64: define dso_local void @f1(
// ELF32: define x86_stdcallcc void @"\01_f1@0"
// ELF64: define void @f1(
void __fastcall f2(void) {}
-// CHECK: define x86_fastcallcc void @"\01@f2@0"
-// X64: define void @f2(
+// CHECK: define dso_local x86_fastcallcc void @"\01@f2@0"
+// X64: define dso_local void @f2(
// ELF32: define x86_fastcallcc void @"\01@f2@0"
// ELF64: define void @f2(
void __stdcall f3() {}
-// CHECK: define x86_stdcallcc void @"\01_f3@0"
-// X64: define void @f3(
+// CHECK: define dso_local x86_stdcallcc void @"\01_f3@0"
+// X64: define dso_local void @f3(
void __fastcall f4(char a) {}
-// CHECK: define x86_fastcallcc void @"\01@f4@4"
-// X64: define void @f4(
+// CHECK: define dso_local x86_fastcallcc void @"\01@f4@4"
+// X64: define dso_local void @f4(
void __fastcall f5(short a) {}
-// CHECK: define x86_fastcallcc void @"\01@f5@4"
-// X64: define void @f5(
+// CHECK: define dso_local x86_fastcallcc void @"\01@f5@4"
+// X64: define dso_local void @f5(
void __fastcall f6(int a) {}
-// CHECK: define x86_fastcallcc void @"\01@f6@4"
-// X64: define void @f6(
+// CHECK: define dso_local x86_fastcallcc void @"\01@f6@4"
+// X64: define dso_local void @f6(
void __fastcall f7(long a) {}
-// CHECK: define x86_fastcallcc void @"\01@f7@4"
-// X64: define void @f7(
+// CHECK: define dso_local x86_fastcallcc void @"\01@f7@4"
+// X64: define dso_local void @f7(
void __fastcall f8(long long a) {}
-// CHECK: define x86_fastcallcc void @"\01@f8@8"
-// X64: define void @f8(
+// CHECK: define dso_local x86_fastcallcc void @"\01@f8@8"
+// X64: define dso_local void @f8(
void __fastcall f9(long long a, char b, char c, short d) {}
-// CHECK: define x86_fastcallcc void @"\01@f9@20"(i64 %a, i8 signext %b, i8 signext %c, i16 signext %d)
-// X64: define void @f9(
+// CHECK: define dso_local x86_fastcallcc void @"\01@f9@20"(i64 %a, i8 signext %b, i8 signext %c, i16 signext %d)
+// X64: define dso_local void @f9(
void f12(void) {}
-// CHECK: define void @f12(
-// X64: define void @f12(
+// CHECK: define dso_local void @f12(
+// X64: define dso_local void @f12(
void __vectorcall v1(void) {}
-// CHECK: define x86_vectorcallcc void @"\01v1@@0"(
-// X64: define x86_vectorcallcc void @"\01v1@@0"(
+// CHECK: define dso_local x86_vectorcallcc void @"\01v1@@0"(
+// X64: define dso_local x86_vectorcallcc void @"\01v1@@0"(
// ELF32: define x86_vectorcallcc void @"\01v1@@0"(
// ELF64: define x86_vectorcallcc void @"\01v1@@0"(
void __vectorcall v2(char a) {}
-// CHECK: define x86_vectorcallcc void @"\01v2@@4"(
-// X64: define x86_vectorcallcc void @"\01v2@@8"(
+// CHECK: define dso_local x86_vectorcallcc void @"\01v2@@4"(
+// X64: define dso_local x86_vectorcallcc void @"\01v2@@8"(
// ELF32: define x86_vectorcallcc void @"\01v2@@4"(
// ELF64: define x86_vectorcallcc void @"\01v2@@8"(
void __vectorcall v3(short a) {}
-// CHECK: define x86_vectorcallcc void @"\01v3@@4"(
-// X64: define x86_vectorcallcc void @"\01v3@@8"(
+// CHECK: define dso_local x86_vectorcallcc void @"\01v3@@4"(
+// X64: define dso_local x86_vectorcallcc void @"\01v3@@8"(
void __vectorcall v4(int a) {}
-// CHECK: define x86_vectorcallcc void @"\01v4@@4"(
-// X64: define x86_vectorcallcc void @"\01v4@@8"(
+// CHECK: define dso_local x86_vectorcallcc void @"\01v4@@4"(
+// X64: define dso_local x86_vectorcallcc void @"\01v4@@8"(
void __vectorcall v5(long long a) {}
-// CHECK: define x86_vectorcallcc void @"\01v5@@8"(
-// X64: define x86_vectorcallcc void @"\01v5@@8"(
+// CHECK: define dso_local x86_vectorcallcc void @"\01v5@@8"(
+// X64: define dso_local x86_vectorcallcc void @"\01v5@@8"(
void __vectorcall v6(char a, char b) {}
-// CHECK: define x86_vectorcallcc void @"\01v6@@8"(
-// X64: define x86_vectorcallcc void @"\01v6@@16"(
+// CHECK: define dso_local x86_vectorcallcc void @"\01v6@@8"(
+// X64: define dso_local x86_vectorcallcc void @"\01v6@@16"(
diff --git a/test/CodeGen/math-builtins.c b/test/CodeGen/math-builtins.c
index 799d91b4ec00..82f049bdd12e 100644
--- a/test/CodeGen/math-builtins.c
+++ b/test/CodeGen/math-builtins.c
@@ -1,6 +1,7 @@
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm %s | FileCheck %s -check-prefix=NO__ERRNO
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s -check-prefix=HAS_ERRNO
// RUN: %clang_cc1 -triple x86_64-unknown-unknown-gnu -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_GNU
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown-android -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_ANDROID
// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_WIN
// Test attributes and codegen of math builtins.
@@ -24,23 +25,27 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
// HAS_ERRNO: declare float @atan2f(float, float) [[NOT_READNONE]]
// HAS_ERRNO: declare x86_fp80 @atan2l(x86_fp80, x86_fp80) [[NOT_READNONE]]
- __builtin_copysign(f,f); __builtin_copysignf(f,f);__builtin_copysignl(f,f);
+ __builtin_copysign(f,f); __builtin_copysignf(f,f); __builtin_copysignl(f,f); __builtin_copysignf128(f,f);
// NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]]
// NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]]
// NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC]]
// HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]]
// HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]]
// HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
+// HAS_ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC]]
- __builtin_fabs(f); __builtin_fabsf(f); __builtin_fabsl(f);
+ __builtin_fabs(f); __builtin_fabsf(f); __builtin_fabsl(f); __builtin_fabsf128(f);
// NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]]
// NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]]
// NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC]]
// HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]]
// HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]]
// HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]]
+// HAS_ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC]]
__builtin_frexp(f,i); __builtin_frexpf(f,i); __builtin_frexpl(f,i);
@@ -51,14 +56,14 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
// HAS_ERRNO: declare float @frexpf(float, i32*) [[NOT_READNONE]]
// HAS_ERRNO: declare x86_fp80 @frexpl(x86_fp80, i32*) [[NOT_READNONE]]
- __builtin_huge_val(); __builtin_huge_valf(); __builtin_huge_vall();
+ __builtin_huge_val(); __builtin_huge_valf(); __builtin_huge_vall(); __builtin_huge_valf128();
// NO__ERRNO-NOT: .huge
// NO__ERRNO-NOT: @huge
// HAS_ERRNO-NOT: .huge
// HAS_ERRNO-NOT: @huge
- __builtin_inf(); __builtin_inff(); __builtin_infl();
+ __builtin_inf(); __builtin_inff(); __builtin_infl(); __builtin_inff128();
// NO__ERRNO-NOT: .inf
// NO__ERRNO-NOT: @inf
@@ -83,23 +88,27 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
// HAS_ERRNO: declare float @modff(float, float*) [[NOT_READNONE]]
// HAS_ERRNO: declare x86_fp80 @modfl(x86_fp80, x86_fp80*) [[NOT_READNONE]]
- __builtin_nan(c); __builtin_nanf(c); __builtin_nanl(c);
+ __builtin_nan(c); __builtin_nanf(c); __builtin_nanl(c); __builtin_nanf128(c);
-// NO__ERRNO: declare double @nan(i8*) [[READNONE]]
-// NO__ERRNO: declare float @nanf(i8*) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @nanl(i8*) [[READNONE]]
-// HAS_ERRNO: declare double @nan(i8*) [[READNONE:#[0-9]+]]
-// HAS_ERRNO: declare float @nanf(i8*) [[READNONE]]
-// HAS_ERRNO: declare x86_fp80 @nanl(i8*) [[READNONE]]
+// NO__ERRNO: declare double @nan(i8*) [[PURE:#[0-9]+]]
+// NO__ERRNO: declare float @nanf(i8*) [[PURE]]
+// NO__ERRNO: declare x86_fp80 @nanl(i8*) [[PURE]]
+// NO__ERRNO: declare fp128 @nanf128(i8*) [[PURE]]
+// HAS_ERRNO: declare double @nan(i8*) [[PURE:#[0-9]+]]
+// HAS_ERRNO: declare float @nanf(i8*) [[PURE]]
+// HAS_ERRNO: declare x86_fp80 @nanl(i8*) [[PURE]]
+// HAS_ERRNO: declare fp128 @nanf128(i8*) [[PURE]]
- __builtin_nans(c); __builtin_nansf(c); __builtin_nansl(c);
+ __builtin_nans(c); __builtin_nansf(c); __builtin_nansl(c); __builtin_nansf128(c);
-// NO__ERRNO: declare double @nans(i8*) [[READNONE]]
-// NO__ERRNO: declare float @nansf(i8*) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @nansl(i8*) [[READNONE]]
-// HAS_ERRNO: declare double @nans(i8*) [[READNONE]]
-// HAS_ERRNO: declare float @nansf(i8*) [[READNONE]]
-// HAS_ERRNO: declare x86_fp80 @nansl(i8*) [[READNONE]]
+// NO__ERRNO: declare double @nans(i8*) [[PURE]]
+// NO__ERRNO: declare float @nansf(i8*) [[PURE]]
+// NO__ERRNO: declare x86_fp80 @nansl(i8*) [[PURE]]
+// NO__ERRNO: declare fp128 @nansf128(i8*) [[PURE]]
+// HAS_ERRNO: declare double @nans(i8*) [[PURE]]
+// HAS_ERRNO: declare float @nansf(i8*) [[PURE]]
+// HAS_ERRNO: declare x86_fp80 @nansl(i8*) [[PURE]]
+// HAS_ERRNO: declare fp128 @nansf128(i8*) [[PURE]]
__builtin_pow(f,f); __builtin_powf(f,f); __builtin_powl(f,f);
@@ -179,7 +188,7 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
// NO__ERRNO: declare double @cbrt(double) [[READNONE]]
// NO__ERRNO: declare float @cbrtf(float) [[READNONE]]
// NO__ERRNO: declare x86_fp80 @cbrtl(x86_fp80) [[READNONE]]
-// HAS_ERRNO: declare double @cbrt(double) [[READNONE]]
+// HAS_ERRNO: declare double @cbrt(double) [[READNONE:#[0-9]+]]
// HAS_ERRNO: declare float @cbrtf(float) [[READNONE]]
// HAS_ERRNO: declare x86_fp80 @cbrtl(x86_fp80) [[READNONE]]
@@ -288,6 +297,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
// HAS_ERRNO_GNU: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]]
// HAS_ERRNO_GNU: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
+// HAS_ERRNO_ANDROID: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC:#[0-9]+]]
+// HAS_ERRNO_ANDROID: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]]
+// HAS_ERRNO_ANDROID: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
+
// HAS_ERRNO_WIN: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC:#[0-9]+]]
// HAS_ERRNO_WIN: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]]
// Long double is just double on win, so no f80 use/declaration.
@@ -568,11 +581,14 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
// NO__ERRNO: attributes [[READNONE]] = { {{.*}}readnone{{.*}} }
// NO__ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}readnone{{.*}} }
// NO__ERRNO: attributes [[NOT_READNONE]] = { nounwind "correctly{{.*}} }
+// NO__ERRNO: attributes [[PURE]] = { {{.*}}readonly{{.*}} }
// HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind "correctly{{.*}} }
// HAS_ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}readnone{{.*}} }
+// HAS_ERRNO: attributes [[PURE]] = { {{.*}}readonly{{.*}} }
// HAS_ERRNO: attributes [[READNONE]] = { {{.*}}readnone{{.*}} }
// HAS_ERRNO_GNU: attributes [[READNONE_INTRINSIC]] = { {{.*}}readnone{{.*}} }
+// HAS_ERRNO_ANDROID: attributes [[READNONE_INTRINSIC]] = { {{.*}}readnone{{.*}} }
// HAS_ERRNO_WIN: attributes [[READNONE_INTRINSIC]] = { {{.*}}readnone{{.*}} }
diff --git a/test/CodeGen/may-alias.c b/test/CodeGen/may-alias.c
index 41eda8587068..cba62af4bb92 100644
--- a/test/CodeGen/may-alias.c
+++ b/test/CodeGen/may-alias.c
@@ -1,18 +1,24 @@
-// RUN: %clang_cc1 -Werror -triple i386-unknown-unknown -emit-llvm -O1 -no-struct-path-tbaa -disable-llvm-passes -o - %s | FileCheck %s
-// RUN: %clang_cc1 -Werror -triple i386-unknown-unknown -emit-llvm -O1 -disable-llvm-passes -o - %s | FileCheck %s -check-prefix=PATH
+// RUN: %clang_cc1 -Werror -triple i386-unknown-unknown -emit-llvm -O1 \
+// RUN: -no-struct-path-tbaa -disable-llvm-passes -o - %s | \
+// RUN: FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=CHECK,SCALAR
+// RUN: %clang_cc1 -Werror -triple i386-unknown-unknown -emit-llvm -O1 \
+// RUN: -disable-llvm-passes -o - %s | \
+// RUN: FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=CHECK,OLD-PATH
+// RUN: %clang_cc1 -Werror -triple i386-unknown-unknown -emit-llvm -O1 \
+// RUN: -new-struct-path-tbaa -disable-llvm-passes -o - %s | \
+// RUN: FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=CHECK,NEW-PATH
// Types with the may_alias attribute should be considered equivalent
// to char for aliasing.
typedef int __attribute__((may_alias)) aliasing_int;
-void test0(aliasing_int *ai, int *i)
-{
-// CHECK: store i32 0, i32* %{{.*}}, !tbaa [[TAG_CHAR:!.*]]
-// PATH: store i32 0, i32* %{{.*}}, !tbaa [[TAG_CHAR:!.*]]
+void test0(aliasing_int *ai, int *i) {
+// CHECK-LABEL: test0
+// CHECK: store i32 0, {{.*}}, !tbaa [[TAG_alias_int:!.*]]
*ai = 0;
-// CHECK: store i32 1, i32* %{{.*}}, !tbaa [[TAG_INT:!.*]]
-// PATH: store i32 1, i32* %{{.*}}, !tbaa [[TAG_INT:!.*]]
+
+// CHECK: store i32 1, {{.*}}, !tbaa [[TAG_int:!.*]]
*i = 1;
}
@@ -20,23 +26,36 @@ void test0(aliasing_int *ai, int *i)
struct Test1 { int x; };
struct Test1MA { int x; } __attribute__((may_alias));
void test1(struct Test1MA *p1, struct Test1 *p2) {
- // CHECK: store i32 2, i32* {{%.*}}, !tbaa [[TAG_CHAR]]
- // PATH: store i32 2, i32* {{%.*}}, !tbaa [[TAG_CHAR]]
+// CHECK-LABEL: test1
+// CHECK: store i32 2, {{.*}}, !tbaa [[TAG_alias_test1_x:!.*]]
p1->x = 2;
- // CHECK: store i32 3, i32* {{%.*}}, !tbaa [[TAG_INT]]
- // PATH: store i32 3, i32* {{%.*}}, !tbaa [[TAG_test1_x:!.*]]
+
+// CHECK: store i32 3, {{.*}}, !tbaa [[TAG_test1_x:!.*]]
p2->x = 3;
}
-// CHECK: !"any pointer", [[TYPE_CHAR:!.*]],
-// CHECK: [[TYPE_CHAR]] = !{!"omnipotent char", [[TAG_CXX_TBAA:!.*]],
-// CHECK: [[TAG_CXX_TBAA]] = !{!"Simple C/C++ TBAA"}
-// CHECK: [[TAG_CHAR]] = !{[[TYPE_CHAR]], [[TYPE_CHAR]], i64 0}
-// CHECK: [[TAG_INT]] = !{[[TYPE_INT:!.*]], [[TYPE_INT]], i64 0}
-// CHECK: [[TYPE_INT]] = !{!"int", [[TYPE_CHAR]]
-
-// PATH: [[TYPE_CHAR:!.*]] = !{!"omnipotent char", !{{.*}}
-// PATH: [[TAG_CHAR]] = !{[[TYPE_CHAR]], [[TYPE_CHAR]], i64 0}
-// PATH: [[TAG_INT]] = !{[[TYPE_INT:!.*]], [[TYPE_INT]], i64 0}
-// PATH: [[TYPE_INT]] = !{!"int", [[TYPE_CHAR]]
-// PATH: [[TAG_test1_x]] = !{[[TYPE_test1:!.*]], [[TYPE_INT]], i64 0}
-// PATH: [[TYPE_test1]] = !{!"Test1", [[TYPE_INT]], i64 0}
+
+// SCALAR-DAG: [[ROOT:!.*]] = !{!"Simple C/C++ TBAA"}
+// SCALAR-DAG: [[TYPE_char:!.*]] = !{!"omnipotent char", [[ROOT]], i64 0}
+// SCALAR-DAG: [[TAG_alias_int]] = !{[[TYPE_char]], [[TYPE_char]], i64 0}
+// SCALAR-DAG: [[TAG_alias_test1_x]] = !{[[TYPE_char]], [[TYPE_char]], i64 0}
+// SCALAR-DAG: [[TYPE_int:!.*]] = !{!"int", [[TYPE_char]], i64 0}
+// SCALAR-DAG: [[TAG_int]] = !{[[TYPE_int]], [[TYPE_int]], i64 0}
+// SCALAR-DAG: [[TAG_test1_x]] = !{[[TYPE_int]], [[TYPE_int]], i64 0}
+
+// OLD-PATH-DAG: [[ROOT:!.*]] = !{!"Simple C/C++ TBAA"}
+// OLD-PATH-DAG: [[TYPE_char:!.*]] = !{!"omnipotent char", [[ROOT]], i64 0}
+// OLD-PATH-DAG: [[TAG_alias_int]] = !{[[TYPE_char]], [[TYPE_char]], i64 0}
+// OLD-PATH-DAG: [[TAG_alias_test1_x]] = !{[[TYPE_char]], [[TYPE_char]], i64 0}
+// OLD-PATH-DAG: [[TYPE_int:!.*]] = !{!"int", [[TYPE_char]], i64 0}
+// OLD-PATH-DAG: [[TAG_int]] = !{[[TYPE_int]], [[TYPE_int]], i64 0}
+// OLD-PATH-DAG: [[TYPE_test1:!.*]] = !{!"Test1", [[TYPE_int]], i64 0}
+// OLD-PATH-DAG: [[TAG_test1_x]] = !{[[TYPE_test1]], [[TYPE_int]], i64 0}
+
+// NEW-PATH-DAG: [[ROOT:!.*]] = !{!"Simple C/C++ TBAA"}
+// NEW-PATH-DAG: [[TYPE_char:!.*]] = !{[[ROOT]], i64 1, !"omnipotent char"}
+// NEW-PATH-DAG: [[TAG_alias_int]] = !{[[TYPE_char]], [[TYPE_char]], i64 0, i64 0}
+// NEW-PATH-DAG: [[TAG_alias_test1_x]] = !{[[TYPE_char]], [[TYPE_char]], i64 0, i64 0}
+// NEW-PATH-DAG: [[TYPE_int:!.*]] = !{[[TYPE_char]], i64 4, !"int"}
+// NEW-PATH-DAG: [[TAG_int]] = !{[[TYPE_int]], [[TYPE_int]], i64 0, i64 4}
+// NEW-PATH-DAG: [[TYPE_test1:!.*]] = !{[[TYPE_char]], i64 4, !"Test1", [[TYPE_int]], i64 0, i64 4}
+// NEW-PATH-DAG: [[TAG_test1_x]] = !{[[TYPE_test1]], [[TYPE_int]], i64 0, i64 4}
diff --git a/test/CodeGen/mbackchain-2.c b/test/CodeGen/mbackchain-2.c
index e76afaf7687a..bea46d248263 100644
--- a/test/CodeGen/mbackchain-2.c
+++ b/test/CodeGen/mbackchain-2.c
@@ -1,6 +1,6 @@
// RUN: %clang -mbackchain --target=s390x-linux -S -emit-llvm -o - %s | FileCheck %s
-// CHECK: define void @foo() [[NUW:#[0-9]+]]
+// CHECK: define dso_local void @foo() [[NUW:#[0-9]+]]
void foo(void) {
}
diff --git a/test/CodeGen/mbackchain-3.c b/test/CodeGen/mbackchain-3.c
index b115861f5e02..5ff0e1676100 100644
--- a/test/CodeGen/mbackchain-3.c
+++ b/test/CodeGen/mbackchain-3.c
@@ -1,6 +1,6 @@
// RUN: %clang -mno-backchain --target=s390x-linux -S -emit-llvm -o - %s | FileCheck %s
-// CHECK: define void @foo() [[NUW:#[0-9]+]]
+// CHECK: define dso_local void @foo() [[NUW:#[0-9]+]]
void foo(void) {
}
diff --git a/test/CodeGen/mcount.c b/test/CodeGen/mcount.c
index 98a2a6b39092..c67cf03c0016 100644
--- a/test/CodeGen/mcount.c
+++ b/test/CodeGen/mcount.c
@@ -3,20 +3,20 @@
// RUN: %clang_cc1 -pg -triple powerpc-unknown-gnu-linux -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
// RUN: %clang_cc1 -pg -triple powerpc64-unknown-gnu-linux -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
// RUN: %clang_cc1 -pg -triple powerpc64le-unknown-gnu-linux -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
-// RUN: %clang_cc1 -pg -triple i386-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
-// RUN: %clang_cc1 -pg -triple x86_64-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
-// RUN: %clang_cc1 -pg -triple arm-netbsd-eabi -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
-// RUN: %clang_cc1 -pg -triple aarch64-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
-// RUN: %clang_cc1 -pg -triple mips-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
+// RUN: %clang_cc1 -pg -triple i386-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-DOUBLE-PREFIXED,NO-MCOUNT1 %s
+// RUN: %clang_cc1 -pg -triple x86_64-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-DOUBLE-PREFIXED,NO-MCOUNT1 %s
+// RUN: %clang_cc1 -pg -triple arm-netbsd-eabi -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-DOUBLE-PREFIXED,NO-MCOUNT1 %s
+// RUN: %clang_cc1 -pg -triple aarch64-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-DOUBLE-PREFIXED,NO-MCOUNT1 %s
+// RUN: %clang_cc1 -pg -triple mips-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-DOUBLE-PREFIXED,NO-MCOUNT1 %s
// RUN: %clang_cc1 -pg -triple mips-unknown-gnu-linux -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
// RUN: %clang_cc1 -pg -triple mipsel-unknown-gnu-linux -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
// RUN: %clang_cc1 -pg -triple mips64-unknown-gnu-linux -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
// RUN: %clang_cc1 -pg -triple mips64el-unknown-gnu-linux -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
-// RUN: %clang_cc1 -pg -triple powerpc-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
-// RUN: %clang_cc1 -pg -triple powerpc64-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
-// RUN: %clang_cc1 -pg -triple powerpc64le-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
-// RUN: %clang_cc1 -pg -triple sparc-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
-// RUN: %clang_cc1 -pg -triple sparc64-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
+// RUN: %clang_cc1 -pg -triple powerpc-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-DOUBLE-PREFIXED,NO-MCOUNT1 %s
+// RUN: %clang_cc1 -pg -triple powerpc64-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-DOUBLE-PREFIXED,NO-MCOUNT1 %s
+// RUN: %clang_cc1 -pg -triple powerpc64le-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-DOUBLE-PREFIXED,NO-MCOUNT1 %s
+// RUN: %clang_cc1 -pg -triple sparc-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-DOUBLE-PREFIXED,NO-MCOUNT1 %s
+// RUN: %clang_cc1 -pg -triple sparc64-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-DOUBLE-PREFIXED,NO-MCOUNT1 %s
// RUN: %clang_cc1 -emit-llvm -o - %s | FileCheck %s -check-prefix=NO-MCOUNT
int bar(void) {
@@ -39,5 +39,7 @@ int main(void) {
// CHECK: attributes #1 = { {{.*}} }
// CHECK-PREFIXED: attributes #0 = { {{.*}}"instrument-function-entry-inlined"="_mcount"{{.*}} }
// CHECK-PREFIXED: attributes #1 = { {{.*}} }
+// CHECK-DOUBLE-PREFIXED: attributes #0 = { {{.*}}"instrument-function-entry-inlined"="__mcount"{{.*}} }
+// CHECK-DOUBLE-PREFIXED: attributes #1 = { {{.*}} }
// NO-MCOUNT-NOT: attributes #{{[0-9]}} = { {{.*}}"instrument-function-entry-inlined"={{.*}} }
// NO-MCOUNT1-NOT: attributes #1 = { {{.*}}"instrument-function-entry-inlined"={{.*}} }
diff --git a/test/CodeGen/microsoft-call-conv-x64.c b/test/CodeGen/microsoft-call-conv-x64.c
index 6475dfa2936d..5be748f94c5a 100644
--- a/test/CodeGen/microsoft-call-conv-x64.c
+++ b/test/CodeGen/microsoft-call-conv-x64.c
@@ -3,12 +3,12 @@
void __fastcall f1(void);
void __stdcall f2(void);
void __fastcall f4(void) {
-// CHECK-LABEL: define void @f4()
+// CHECK-LABEL: define dso_local void @f4()
f1();
// CHECK: call void @f1()
}
void __stdcall f5(void) {
-// CHECK-LABEL: define void @f5()
+// CHECK-LABEL: define dso_local void @f5()
f2();
// CHECK: call void @f2()
}
diff --git a/test/CodeGen/mingw-long-double.c b/test/CodeGen/mingw-long-double.c
index 1c7c31f88be3..6026c24b367d 100644
--- a/test/CodeGen/mingw-long-double.c
+++ b/test/CodeGen/mingw-long-double.c
@@ -1,5 +1,7 @@
// RUN: %clang_cc1 -triple i686-windows-gnu -emit-llvm -o - %s \
// RUN: | FileCheck %s --check-prefix=GNU32
+// RUN: %clang_cc1 -triple i686-windows-gnu -emit-llvm -o - %s -mms-bitfields \
+// RUN: | FileCheck %s --check-prefix=GNU32
// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -o - %s \
// RUN: | FileCheck %s --check-prefix=GNU64
// RUN: %clang_cc1 -triple x86_64-windows-msvc -emit-llvm -o - %s \
@@ -10,32 +12,36 @@ struct {
long double ldb;
} agggregate_LD = {};
// GNU32: %struct.anon = type { i8, x86_fp80 }
-// GNU32: @agggregate_LD = global %struct.anon zeroinitializer, align 4
+// GNU32: @agggregate_LD = dso_local global %struct.anon zeroinitializer, align 4
// GNU64: %struct.anon = type { i8, x86_fp80 }
-// GNU64: @agggregate_LD = global %struct.anon zeroinitializer, align 16
+// GNU64: @agggregate_LD = dso_local global %struct.anon zeroinitializer, align 16
// MSC64: %struct.anon = type { i8, double }
-// MSC64: @agggregate_LD = global %struct.anon zeroinitializer, align 8
+// MSC64: @agggregate_LD = dso_local global %struct.anon zeroinitializer, align 8
long double dataLD = 1.0L;
-// GNU32: @dataLD = global x86_fp80 0xK3FFF8000000000000000, align 4
-// GNU64: @dataLD = global x86_fp80 0xK3FFF8000000000000000, align 16
-// MSC64: @dataLD = global double 1.000000e+00, align 8
+// GNU32: @dataLD = dso_local global x86_fp80 0xK3FFF8000000000000000, align 4
+// GNU64: @dataLD = dso_local global x86_fp80 0xK3FFF8000000000000000, align 16
+// MSC64: @dataLD = dso_local global double 1.000000e+00, align 8
long double _Complex dataLDC = {1.0L, 1.0L};
-// GNU32: @dataLDC = global { x86_fp80, x86_fp80 } { x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000 }, align 4
-// GNU64: @dataLDC = global { x86_fp80, x86_fp80 } { x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000 }, align 16
-// MSC64: @dataLDC = global { double, double } { double 1.000000e+00, double 1.000000e+00 }, align 8
+// GNU32: @dataLDC = dso_local global { x86_fp80, x86_fp80 } { x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000 }, align 4
+// GNU64: @dataLDC = dso_local global { x86_fp80, x86_fp80 } { x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000 }, align 16
+// MSC64: @dataLDC = dso_local global { double, double } { double 1.000000e+00, double 1.000000e+00 }, align 8
long double TestLD(long double x) {
return x * x;
}
-// GNU32: define x86_fp80 @TestLD(x86_fp80 %x)
-// GNU64: define void @TestLD(x86_fp80* noalias sret %agg.result, x86_fp80*)
-// MSC64: define double @TestLD(double %x)
+// GNU32: define dso_local x86_fp80 @TestLD(x86_fp80 %x)
+// GNU64: define dso_local void @TestLD(x86_fp80* noalias sret %agg.result, x86_fp80*)
+// MSC64: define dso_local double @TestLD(double %x)
long double _Complex TestLDC(long double _Complex x) {
return x * x;
}
-// GNU32: define void @TestLDC({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval align 4 %x)
-// GNU64: define void @TestLDC({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* %x)
-// MSC64: define void @TestLDC({ double, double }* noalias sret %agg.result, { double, double }* %x)
+// GNU32: define dso_local void @TestLDC({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval align 4 %x)
+// GNU64: define dso_local void @TestLDC({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* %x)
+// MSC64: define dso_local void @TestLDC({ double, double }* noalias sret %agg.result, { double, double }* %x)
+
+// GNU32: declare dso_local void @__mulxc3
+// GNU64: declare dso_local void @__mulxc3
+// MSC64: declare dso_local void @__muldc3
diff --git a/test/CodeGen/mips-vector-return.c b/test/CodeGen/mips-vector-return.c
index 8af4998cdf7c..dd3c400c0197 100644
--- a/test/CodeGen/mips-vector-return.c
+++ b/test/CodeGen/mips-vector-return.c
@@ -8,13 +8,13 @@ typedef float v4sf __attribute__ ((__vector_size__ (16)));
typedef double v4df __attribute__ ((__vector_size__ (32)));
typedef int v4i32 __attribute__ ((__vector_size__ (16)));
-// O32-LABEL: define void @test_v4sf(<4 x float>* noalias nocapture sret
+// O32-LABEL: define dso_local void @test_v4sf(<4 x float>* noalias nocapture sret
// N64: define inreg { i64, i64 } @test_v4sf
v4sf test_v4sf(float a) {
return (v4sf){0.0f, a, 0.0f, 0.0f};
}
-// O32-LABEL: define void @test_v4df(<4 x double>* noalias nocapture sret
+// O32-LABEL: define dso_local void @test_v4df(<4 x double>* noalias nocapture sret
// N64-LABEL: define void @test_v4df(<4 x double>* noalias nocapture sret
v4df test_v4df(double a) {
return (v4df){0.0, a, 0.0, 0.0};
@@ -23,7 +23,7 @@ v4df test_v4df(double a) {
// O32 returns integer vectors whose size is equal to or smaller than 16-bytes
// in integer registers.
//
-// O32: define inreg { i32, i32, i32, i32 } @test_v4i32
+// O32: define dso_local inreg { i32, i32, i32, i32 } @test_v4i32
// N64: define inreg { i64, i64 } @test_v4i32
v4i32 test_v4i32(int a) {
return (v4i32){0, a, 0, 0};
diff --git a/test/CodeGen/mms-bitfields.c b/test/CodeGen/mms-bitfields.c
index 1617e8ac40d9..d4604fbaa44e 100644
--- a/test/CodeGen/mms-bitfields.c
+++ b/test/CodeGen/mms-bitfields.c
@@ -20,3 +20,46 @@ struct s3 {
} s3;
// CHECK: %struct.s3 = type { i32, [4 x i8], %struct.s1 }
+
+// PR32482:
+
+#pragma pack (push,1)
+
+typedef unsigned int UINT32;
+
+struct Inner {
+ UINT32 A : 1;
+ UINT32 B : 1;
+ UINT32 C : 1;
+ UINT32 D : 30;
+} Inner;
+
+#pragma pack (pop)
+
+// CHECK: %struct.Inner = type { i32, i32 }
+
+// CHECK: %struct.A = type { i32, i32, i32 }
+
+#pragma pack(push, 1)
+
+union HEADER {
+ struct A {
+ int : 3; // Bits 2:0
+ int a : 9; // Bits 11:3
+ int : 12; // Bits 23:12
+ int b : 17; // Bits 40:24
+ int : 7; // Bits 47:41
+ int c : 4; // Bits 51:48
+ int : 4; // Bits 55:52
+ int d : 3; // Bits 58:56
+ int : 5; // Bits 63:59
+ } Bits;
+} HEADER;
+
+#pragma pack(pop)
+
+struct Inner variable = { 1,0,1, 21 };
+union HEADER hdr = {{1,2,3,4}};
+
+// CHECK: @variable = global { i8, [3 x i8], i8, i8, i8, i8 } { i8 5, [3 x i8] undef, i8 21, i8 0, i8 0, i8 0 }, align 1
+// CHECK: @hdr = global { { i8, i8, [2 x i8], i8, i8, i8, i8, i8, [3 x i8] } } { { i8, i8, [2 x i8], i8, i8, i8, i8, i8, [3 x i8] } { i8 8, i8 0, [2 x i8] undef, i8 2, i8 0, i8 0, i8 3, i8 4, [3 x i8] undef } }, align 1
diff --git a/test/CodeGen/mmx-builtins.c b/test/CodeGen/mmx-builtins.c
index cd725e22b83c..ab01c7bd5a52 100644
--- a/test/CodeGen/mmx-builtins.c
+++ b/test/CodeGen/mmx-builtins.c
@@ -2,7 +2,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
__m64 test_mm_abs_pi8(__m64 a) {
// CHECK-LABEL: test_mm_abs_pi8
diff --git a/test/CodeGen/ms-align-tentative.c b/test/CodeGen/ms-align-tentative.c
index eb68e69f586b..ae9b4b10241c 100644
--- a/test/CodeGen/ms-align-tentative.c
+++ b/test/CodeGen/ms-align-tentative.c
@@ -1,25 +1,25 @@
// RUN: %clang_cc1 -triple i386-pc-win32 -emit-llvm -fms-compatibility -o - < %s | FileCheck %s
char __declspec(align(8192)) x;
-// CHECK-DAG: @x = global i8 0, align 8192
+// CHECK-DAG: @x = dso_local global i8 0, align 8192
typedef char __declspec(align(8192)) T;
T y;
-// CHECK-DAG: @y = global i8 0, align 8192
+// CHECK-DAG: @y = dso_local global i8 0, align 8192
T __declspec(align(8192)) z;
-// CHECK-DAG: @z = global i8 0, align 8192
+// CHECK-DAG: @z = dso_local global i8 0, align 8192
int __declspec(align(16)) redef;
int __declspec(align(32)) redef = 8;
-// CHECK-DAG: @redef = global i32 8, align 32
+// CHECK-DAG: @redef = dso_local global i32 8, align 32
struct __declspec(align(64)) S {
char fd;
} s;
-// CHECK-DAG: @s = global %struct.S zeroinitializer, align 64
+// CHECK-DAG: @s = dso_local global %struct.S zeroinitializer, align 64
struct Wrap {
struct S x;
} w;
-// CHECK-DAG: @w = global %struct.Wrap zeroinitializer, align 64
+// CHECK-DAG: @w = dso_local global %struct.Wrap zeroinitializer, align 64
diff --git a/test/CodeGen/ms-annotation.c b/test/CodeGen/ms-annotation.c
index 6f4a20c7b154..8ad48366bfda 100644
--- a/test/CodeGen/ms-annotation.c
+++ b/test/CodeGen/ms-annotation.c
@@ -11,7 +11,7 @@ void test1(void) {
__annotation(L"unicode: \u0ca0_\u0ca0");
}
-// CHECK-LABEL: define void @test1()
+// CHECK-LABEL: define dso_local void @test1()
// CHECK: call void @llvm.codeview.annotation(metadata ![[A1:[0-9]+]])
// CHECK: call void @llvm.codeview.annotation(metadata ![[A2:[0-9]+]])
// CHECK: call void @llvm.codeview.annotation(metadata ![[A3:[0-9]+]])
diff --git a/test/CodeGen/ms-barriers-intrinsics.c b/test/CodeGen/ms-barriers-intrinsics.c
index c7da50cd0d83..7f87c9017046 100644
--- a/test/CodeGen/ms-barriers-intrinsics.c
+++ b/test/CodeGen/ms-barriers-intrinsics.c
@@ -12,26 +12,26 @@ typedef __SIZE_TYPE__ size_t;
#include <intrin.h>
void test_ReadWriteBarrier() { _ReadWriteBarrier(); }
-// CHECK-LABEL: define void @test_ReadWriteBarrier
+// CHECK-LABEL: define dso_local void @test_ReadWriteBarrier
// CHECK: fence syncscope("singlethread") seq_cst
// CHECK: ret void
// CHECK: }
void test_ReadBarrier() { _ReadBarrier(); }
-// CHECK-LABEL: define void @test_ReadBarrier
+// CHECK-LABEL: define dso_local void @test_ReadBarrier
// CHECK: fence syncscope("singlethread") seq_cst
// CHECK: ret void
// CHECK: }
void test_WriteBarrier() { _WriteBarrier(); }
-// CHECK-LABEL: define void @test_WriteBarrier
+// CHECK-LABEL: define dso_local void @test_WriteBarrier
// CHECK: fence syncscope("singlethread") seq_cst
// CHECK: ret void
// CHECK: }
#if defined(__x86_64__)
void test__faststorefence() { __faststorefence(); }
-// CHECK-X64-LABEL: define void @test__faststorefence
+// CHECK-X64-LABEL: define dso_local void @test__faststorefence
// CHECK-X64: fence seq_cst
// CHECK-X64: ret void
// CHECK-X64: }
diff --git a/test/CodeGen/ms-declspecs.c b/test/CodeGen/ms-declspecs.c
index 05810bb4b71c..68d964003ec8 100644
--- a/test/CodeGen/ms-declspecs.c
+++ b/test/CodeGen/ms-declspecs.c
@@ -2,14 +2,14 @@
__declspec(selectany) int x1 = 1;
const __declspec(selectany) int x2 = 2;
-// CHECK: @x1 = weak_odr global i32 1, comdat, align 4
-// CHECK: @x2 = weak_odr constant i32 2, comdat, align 4
+// CHECK: @x1 = weak_odr dso_local global i32 1, comdat, align 4
+// CHECK: @x2 = weak_odr dso_local constant i32 2, comdat, align 4
// selectany turns extern variable declarations into definitions.
__declspec(selectany) int x3;
extern __declspec(selectany) int x4;
-// CHECK: @x3 = weak_odr global i32 0, comdat, align 4
-// CHECK: @x4 = weak_odr global i32 0, comdat, align 4
+// CHECK: @x3 = weak_odr dso_local global i32 0, comdat, align 4
+// CHECK: @x4 = weak_odr dso_local global i32 0, comdat, align 4
struct __declspec(align(16)) S {
char x;
@@ -19,14 +19,14 @@ union { struct S s; } u;
// CHECK: @u = {{.*}}zeroinitializer, align 16
-// CHECK: define void @t3() [[NAKED:#[0-9]+]] {
+// CHECK: define dso_local void @t3() [[NAKED:#[0-9]+]] {
__declspec(naked) void t3() {}
-// CHECK: define void @t22() [[NUW:#[0-9]+]]
+// CHECK: define dso_local void @t22() [[NUW:#[0-9]+]]
void __declspec(nothrow) t22();
void t22() {}
-// CHECK: define void @t2() [[NI:#[0-9]+]] {
+// CHECK: define dso_local void @t2() [[NI:#[0-9]+]] {
__declspec(noinline) void t2() {}
// CHECK: call void @f20_t() [[NR:#[0-9]+]]
diff --git a/test/CodeGen/ms-declspecs.cpp b/test/CodeGen/ms-declspecs.cpp
index decf5d6fcb22..99e39f9c9ea7 100644
--- a/test/CodeGen/ms-declspecs.cpp
+++ b/test/CodeGen/ms-declspecs.cpp
@@ -8,8 +8,8 @@ extern "C" {
__declspec(selectany) int x4;
}
__declspec(selectany) int x5;
-// CHECK: @"\01?x1@@3HA" = weak_odr global i32 0, comdat, align 4
-// CHECK: @x2 = weak_odr global i32 0, comdat, align 4
-// CHECK: @"\01?x3@@3HA" = weak_odr global i32 0, comdat, align 4
-// CHECK: @x4 = weak_odr global i32 0, comdat, align 4
-// CHECK: @"\01?x5@@3HA" = weak_odr global i32 0, comdat, align 4
+// CHECK: @"?x1@@3HA" = weak_odr dso_local global i32 0, comdat, align 4
+// CHECK: @x2 = weak_odr dso_local global i32 0, comdat, align 4
+// CHECK: @"?x3@@3HA" = weak_odr dso_local global i32 0, comdat, align 4
+// CHECK: @x4 = weak_odr dso_local global i32 0, comdat, align 4
+// CHECK: @"?x5@@3HA" = weak_odr dso_local global i32 0, comdat, align 4
diff --git a/test/CodeGen/ms-inline-asm-align.c b/test/CodeGen/ms-inline-asm-align.c
index de896b8e60d6..4786b2fe6e6c 100644
--- a/test/CodeGen/ms-inline-asm-align.c
+++ b/test/CodeGen/ms-inline-asm-align.c
@@ -21,7 +21,7 @@ void align_test() {
// DARWIN-SAME: .align 8
// DARWIN-SAME: "~{dirflag},~{fpsr},~{flags}"()
-// WINDOWS-LABEL: define void @align_test()
+// WINDOWS-LABEL: define dso_local void @align_test()
// WINDOWS: call void asm sideeffect inteldialect
// WINDOWS-SAME: .align 8
// WINDOWS-SAME: .align 16
diff --git a/test/CodeGen/ms-inline-asm-avx512.c b/test/CodeGen/ms-inline-asm-avx512.c
index 6189e50d2116..76c0a57250b3 100644
--- a/test/CodeGen/ms-inline-asm-avx512.c
+++ b/test/CodeGen/ms-inline-asm-avx512.c
@@ -21,7 +21,7 @@ void t2() {
}
void ignore_fe_size() {
- // CHECK-LABEL: define void @ignore_fe_size()
+ // CHECK-LABEL: define dso_local void @ignore_fe_size()
char c;
// CHECK: vaddps xmm1, xmm2, $1{1to4}
__asm vaddps xmm1, xmm2, [c]{1to4}
diff --git a/test/CodeGen/ms-inline-asm.c b/test/CodeGen/ms-inline-asm.c
index 5c3e3ff2a843..edcbaa8aa455 100644
--- a/test/CodeGen/ms-inline-asm.c
+++ b/test/CodeGen/ms-inline-asm.c
@@ -754,6 +754,13 @@ void mxcsr() {
// CHECK-LABEL: define void @mxcsr
// CHECK: call void asm sideeffect inteldialect "fxrstor $0", "=*m,~{dirflag},~{fpsr},~{flags}"
+// Make sure we can find the register for the dirflag for popfd
+void dirflag() {
+ __asm popfd
+}
+// CHECK-LABEL: define void @dirflag
+// CHECK: call void asm sideeffect inteldialect "popfd", "~{dirflag},~{flags},~{esp},~{dirflag},~{fpsr},~{flags}"
+
typedef union _LARGE_INTEGER {
struct {
unsigned int LowPart;
diff --git a/test/CodeGen/ms-intrinsics-other.c b/test/CodeGen/ms-intrinsics-other.c
index d23bc7301801..65d767058462 100644
--- a/test/CodeGen/ms-intrinsics-other.c
+++ b/test/CodeGen/ms-intrinsics-other.c
@@ -148,14 +148,3 @@ LONG test_InterlockedDecrement(LONG volatile *Addend) {
// CHECK: [[RESULT:%[0-9]+]] = add i32 [[TMP]], -1
// CHECK: ret i32 [[RESULT]]
// CHECK: }
-
-unsigned char test_interlockedbittestandset(volatile LONG *ptr, LONG bit) {
- return _interlockedbittestandset(ptr, bit);
-}
-// CHECK-LABEL: define{{.*}} i8 @test_interlockedbittestandset
-// CHECK: [[MASKBIT:%[0-9]+]] = shl i32 1, %bit
-// CHECK: [[OLD:%[0-9]+]] = atomicrmw or i32* %ptr, i32 [[MASKBIT]] seq_cst
-// CHECK: [[SHIFT:%[0-9]+]] = lshr i32 [[OLD]], %bit
-// CHECK: [[TRUNC:%[0-9]+]] = trunc i32 [[SHIFT]] to i8
-// CHECK: [[AND:%[0-9]+]] = and i8 [[TRUNC]], 1
-// CHECK: ret i8 [[AND]]
diff --git a/test/CodeGen/ms-intrinsics-rotations.c b/test/CodeGen/ms-intrinsics-rotations.c
index 9533e6c3c6a2..735de6e41e68 100644
--- a/test/CodeGen/ms-intrinsics-rotations.c
+++ b/test/CodeGen/ms-intrinsics-rotations.c
@@ -30,13 +30,12 @@ unsigned char test_rotl8(unsigned char value, unsigned char shift) {
return _rotl8(value, shift);
}
// CHECK: i8 @test_rotl8
-// CHECK: [[SHIFT:%[0-9]+]] = and i8 %{{[0-9]+}}, 7
-// CHECK: [[NEGSHIFT:%[0-9]+]] = sub i8 8, [[SHIFT]]
-// CHECK: [[HIGH:%[0-9]+]] = shl i8 [[VALUE:%[0-9]+]], [[SHIFT]]
-// CHECK: [[LOW:%[0-9]+]] = lshr i8 [[VALUE]], [[NEGSHIFT]]
-// CHECK: [[ROTATED:%[0-9]+]] = or i8 [[HIGH]], [[LOW]]
-// CHECK: [[ISZERO:%[0-9]+]] = icmp eq i8 [[SHIFT]], 0
-// CHECK: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i8 [[VALUE]], i8 [[ROTATED]]
+// CHECK: [[LSHIFT:%[0-9]+]] = and i8 [[SHIFT:%[0-9]+]], 7
+// CHECK: [[HIGH:%[0-9]+]] = shl i8 [[VALUE:%[0-9]+]], [[LSHIFT]]
+// CHECK: [[NEGATE:%[0-9]+]] = sub i8 0, [[SHIFT]]
+// CHECK: [[RSHIFT:%[0-9]+]] = and i8 [[NEGATE]], 7
+// CHECK: [[LOW:%[0-9]+]] = lshr i8 [[VALUE]], [[RSHIFT]]
+// CHECK: [[RESULT:%[0-9]+]] = or i8 [[HIGH]], [[LOW]]
// CHECK: ret i8 [[RESULT]]
// CHECK }
@@ -44,13 +43,12 @@ unsigned short test_rotl16(unsigned short value, unsigned char shift) {
return _rotl16(value, shift);
}
// CHECK: i16 @test_rotl16
-// CHECK: [[SHIFT:%[0-9]+]] = and i16 %{{[0-9]+}}, 15
-// CHECK: [[NEGSHIFT:%[0-9]+]] = sub i16 16, [[SHIFT]]
-// CHECK: [[HIGH:%[0-9]+]] = shl i16 [[VALUE:%[0-9]+]], [[SHIFT]]
-// CHECK: [[LOW:%[0-9]+]] = lshr i16 [[VALUE]], [[NEGSHIFT]]
-// CHECK: [[ROTATED:%[0-9]+]] = or i16 [[HIGH]], [[LOW]]
-// CHECK: [[ISZERO:%[0-9]+]] = icmp eq i16 [[SHIFT]], 0
-// CHECK: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i16 [[VALUE]], i16 [[ROTATED]]
+// CHECK: [[LSHIFT:%[0-9]+]] = and i16 [[SHIFT:%[0-9]+]], 15
+// CHECK: [[HIGH:%[0-9]+]] = shl i16 [[VALUE:%[0-9]+]], [[LSHIFT]]
+// CHECK: [[NEGATE:%[0-9]+]] = sub i16 0, [[SHIFT]]
+// CHECK: [[RSHIFT:%[0-9]+]] = and i16 [[NEGATE]], 15
+// CHECK: [[LOW:%[0-9]+]] = lshr i16 [[VALUE]], [[RSHIFT]]
+// CHECK: [[RESULT:%[0-9]+]] = or i16 [[HIGH]], [[LOW]]
// CHECK: ret i16 [[RESULT]]
// CHECK }
@@ -58,13 +56,12 @@ unsigned int test_rotl(unsigned int value, int shift) {
return _rotl(value, shift);
}
// CHECK: i32 @test_rotl
-// CHECK: [[SHIFT:%[0-9]+]] = and i32 %{{[0-9]+}}, 31
-// CHECK: [[NEGSHIFT:%[0-9]+]] = sub i32 32, [[SHIFT]]
-// CHECK: [[HIGH:%[0-9]+]] = shl i32 [[VALUE:%[0-9]+]], [[SHIFT]]
-// CHECK: [[LOW:%[0-9]+]] = lshr i32 [[VALUE]], [[NEGSHIFT]]
-// CHECK: [[ROTATED:%[0-9]+]] = or i32 [[HIGH]], [[LOW]]
-// CHECK: [[ISZERO:%[0-9]+]] = icmp eq i32 [[SHIFT]], 0
-// CHECK: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i32 [[VALUE]], i32 [[ROTATED]]
+// CHECK: [[LSHIFT:%[0-9]+]] = and i32 [[SHIFT:%[0-9]+]], 31
+// CHECK: [[HIGH:%[0-9]+]] = shl i32 [[VALUE:%[0-9]+]], [[LSHIFT]]
+// CHECK: [[NEGATE:%[0-9]+]] = sub i32 0, [[SHIFT]]
+// CHECK: [[RSHIFT:%[0-9]+]] = and i32 [[NEGATE]], 31
+// CHECK: [[LOW:%[0-9]+]] = lshr i32 [[VALUE]], [[RSHIFT]]
+// CHECK: [[RESULT:%[0-9]+]] = or i32 [[HIGH]], [[LOW]]
// CHECK: ret i32 [[RESULT]]
// CHECK }
@@ -72,13 +69,12 @@ unsigned LONG test_lrotl(unsigned LONG value, int shift) {
return _lrotl(value, shift);
}
// CHECK-32BIT-LONG: i32 @test_lrotl
-// CHECK-32BIT-LONG: [[SHIFT:%[0-9]+]] = and i32 %{{[0-9]+}}, 31
-// CHECK-32BIT-LONG: [[NEGSHIFT:%[0-9]+]] = sub i32 32, [[SHIFT]]
-// CHECK-32BIT-LONG: [[HIGH:%[0-9]+]] = shl i32 [[VALUE:%[0-9]+]], [[SHIFT]]
-// CHECK-32BIT-LONG: [[LOW:%[0-9]+]] = lshr i32 [[VALUE]], [[NEGSHIFT]]
-// CHECK-32BIT-LONG: [[ROTATED:%[0-9]+]] = or i32 [[HIGH]], [[LOW]]
-// CHECK-32BIT-LONG: [[ISZERO:%[0-9]+]] = icmp eq i32 [[SHIFT]], 0
-// CHECK-32BIT-LONG: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i32 [[VALUE]], i32 [[ROTATED]]
+// CHECK-32BIT-LONG: [[LSHIFT:%[0-9]+]] = and i32 [[SHIFT:%[0-9]+]], 31
+// CHECK-32BIT-LONG: [[HIGH:%[0-9]+]] = shl i32 [[VALUE:%[0-9]+]], [[LSHIFT]]
+// CHECK-32BIT-LONG: [[NEGATE:%[0-9]+]] = sub i32 0, [[SHIFT]]
+// CHECK-32BIT-LONG: [[RSHIFT:%[0-9]+]] = and i32 [[NEGATE]], 31
+// CHECK-32BIT-LONG: [[LOW:%[0-9]+]] = lshr i32 [[VALUE]], [[RSHIFT]]
+// CHECK-32BIT-LONG: [[RESULT:%[0-9]+]] = or i32 [[HIGH]], [[LOW]]
// CHECK-32BIT-LONG: ret i32 [[RESULT]]
// CHECK-32BIT-LONG }
@@ -86,13 +82,12 @@ unsigned __int64 test_rotl64(unsigned __int64 value, int shift) {
return _rotl64(value, shift);
}
// CHECK: i64 @test_rotl64
-// CHECK: [[SHIFT:%[0-9]+]] = and i64 %{{[0-9]+}}, 63
-// CHECK: [[NEGSHIFT:%[0-9]+]] = sub i64 64, [[SHIFT]]
-// CHECK: [[HIGH:%[0-9]+]] = shl i64 [[VALUE:%[0-9]+]], [[SHIFT]]
-// CHECK: [[LOW:%[0-9]+]] = lshr i64 [[VALUE]], [[NEGSHIFT]]
-// CHECK: [[ROTATED:%[0-9]+]] = or i64 [[HIGH]], [[LOW]]
-// CHECK: [[ISZERO:%[0-9]+]] = icmp eq i64 [[SHIFT]], 0
-// CHECK: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i64 [[VALUE]], i64 [[ROTATED]]
+// CHECK: [[LSHIFT:%[0-9]+]] = and i64 [[SHIFT:%[0-9]+]], 63
+// CHECK: [[HIGH:%[0-9]+]] = shl i64 [[VALUE:%[0-9]+]], [[LSHIFT]]
+// CHECK: [[NEGATE:%[0-9]+]] = sub i64 0, [[SHIFT]]
+// CHECK: [[RSHIFT:%[0-9]+]] = and i64 [[NEGATE]], 63
+// CHECK: [[LOW:%[0-9]+]] = lshr i64 [[VALUE]], [[RSHIFT]]
+// CHECK: [[RESULT:%[0-9]+]] = or i64 [[HIGH]], [[LOW]]
// CHECK: ret i64 [[RESULT]]
// CHECK }
@@ -102,41 +97,36 @@ unsigned char test_rotr8(unsigned char value, unsigned char shift) {
return _rotr8(value, shift);
}
// CHECK: i8 @test_rotr8
-// CHECK: [[SHIFT:%[0-9]+]] = and i8 %{{[0-9]+}}, 7
-// CHECK: [[NEGSHIFT:%[0-9]+]] = sub i8 8, [[SHIFT]]
-// CHECK: [[LOW:%[0-9]+]] = lshr i8 [[VALUE:%[0-9]+]], [[SHIFT]]
-// CHECK: [[HIGH:%[0-9]+]] = shl i8 [[VALUE]], [[NEGSHIFT]]
-// CHECK: [[ROTATED:%[0-9]+]] = or i8 [[HIGH]], [[LOW]]
-// CHECK: [[ISZERO:%[0-9]+]] = icmp eq i8 [[SHIFT]], 0
-// CHECK: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i8 [[VALUE]], i8 [[ROTATED]]
-// CHECK: ret i8 [[RESULT]]
+// CHECK: [[RSHIFT:%[0-9]+]] = and i8 [[SHIFT:%[0-9]+]], 7
+// CHECK: [[LOW:%[0-9]+]] = lshr i8 [[VALUE:%[0-9]+]], [[RSHIFT]]
+// CHECK: [[NEGATE:%[0-9]+]] = sub i8 0, [[SHIFT]]
+// CHECK: [[LSHIFT:%[0-9]+]] = and i8 [[NEGATE]], 7
+// CHECK: [[HIGH:%[0-9]+]] = shl i8 [[VALUE]], [[LSHIFT]]
+// CHECK: [[RESULT:%[0-9]+]] = or i8 [[HIGH]], [[LOW]]
// CHECK }
unsigned short test_rotr16(unsigned short value, unsigned char shift) {
return _rotr16(value, shift);
}
// CHECK: i16 @test_rotr16
-// CHECK: [[SHIFT:%[0-9]+]] = and i16 %{{[0-9]+}}, 15
-// CHECK: [[NEGSHIFT:%[0-9]+]] = sub i16 16, [[SHIFT]]
-// CHECK: [[LOW:%[0-9]+]] = lshr i16 [[VALUE:%[0-9]+]], [[SHIFT]]
-// CHECK: [[HIGH:%[0-9]+]] = shl i16 [[VALUE]], [[NEGSHIFT]]
-// CHECK: [[ROTATED:%[0-9]+]] = or i16 [[HIGH]], [[LOW]]
-// CHECK: [[ISZERO:%[0-9]+]] = icmp eq i16 [[SHIFT]], 0
-// CHECK: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i16 [[VALUE]], i16 [[ROTATED]]
-// CHECK: ret i16 [[RESULT]]
+// CHECK: [[RSHIFT:%[0-9]+]] = and i16 [[SHIFT:%[0-9]+]], 15
+// CHECK: [[LOW:%[0-9]+]] = lshr i16 [[VALUE:%[0-9]+]], [[RSHIFT]]
+// CHECK: [[NEGATE:%[0-9]+]] = sub i16 0, [[SHIFT]]
+// CHECK: [[LSHIFT:%[0-9]+]] = and i16 [[NEGATE]], 15
+// CHECK: [[HIGH:%[0-9]+]] = shl i16 [[VALUE]], [[LSHIFT]]
+// CHECK: [[RESULT:%[0-9]+]] = or i16 [[HIGH]], [[LOW]]
// CHECK }
unsigned int test_rotr(unsigned int value, int shift) {
return _rotr(value, shift);
}
// CHECK: i32 @test_rotr
-// CHECK: [[SHIFT:%[0-9]+]] = and i32 %{{[0-9]+}}, 31
-// CHECK: [[NEGSHIFT:%[0-9]+]] = sub i32 32, [[SHIFT]]
-// CHECK: [[LOW:%[0-9]+]] = lshr i32 [[VALUE:%[0-9]+]], [[SHIFT]]
-// CHECK: [[HIGH:%[0-9]+]] = shl i32 [[VALUE]], [[NEGSHIFT]]
-// CHECK: [[ROTATED:%[0-9]+]] = or i32 [[HIGH]], [[LOW]]
-// CHECK: [[ISZERO:%[0-9]+]] = icmp eq i32 [[SHIFT]], 0
-// CHECK: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i32 [[VALUE]], i32 [[ROTATED]]
+// CHECK: [[RSHIFT:%[0-9]+]] = and i32 [[SHIFT:%[0-9]+]], 31
+// CHECK: [[LOW:%[0-9]+]] = lshr i32 [[VALUE:%[0-9]+]], [[RSHIFT]]
+// CHECK: [[NEGATE:%[0-9]+]] = sub i32 0, [[SHIFT]]
+// CHECK: [[LSHIFT:%[0-9]+]] = and i32 [[NEGATE]], 31
+// CHECK: [[HIGH:%[0-9]+]] = shl i32 [[VALUE]], [[LSHIFT]]
+// CHECK: [[RESULT:%[0-9]+]] = or i32 [[HIGH]], [[LOW]]
// CHECK: ret i32 [[RESULT]]
// CHECK }
@@ -144,13 +134,12 @@ unsigned LONG test_lrotr(unsigned LONG value, int shift) {
return _lrotr(value, shift);
}
// CHECK-32BIT-LONG: i32 @test_lrotr
-// CHECK-32BIT-LONG: [[SHIFT:%[0-9]+]] = and i32 %{{[0-9]+}}, 31
-// CHECK-32BIT-LONG: [[NEGSHIFT:%[0-9]+]] = sub i32 32, [[SHIFT]]
-// CHECK-32BIT-LONG: [[LOW:%[0-9]+]] = lshr i32 [[VALUE:%[0-9]+]], [[SHIFT]]
-// CHECK-32BIT-LONG: [[HIGH:%[0-9]+]] = shl i32 [[VALUE]], [[NEGSHIFT]]
-// CHECK-32BIT-LONG: [[ROTATED:%[0-9]+]] = or i32 [[HIGH]], [[LOW]]
-// CHECK-32BIT-LONG: [[ISZERO:%[0-9]+]] = icmp eq i32 [[SHIFT]], 0
-// CHECK-32BIT-LONG: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i32 [[VALUE]], i32 [[ROTATED]]
+// CHECK-32BIT-LONG: [[RSHIFT:%[0-9]+]] = and i32 [[SHIFT:%[0-9]+]], 31
+// CHECK-32BIT-LONG: [[LOW:%[0-9]+]] = lshr i32 [[VALUE:%[0-9]+]], [[RSHIFT]]
+// CHECK-32BIT-LONG: [[NEGATE:%[0-9]+]] = sub i32 0, [[SHIFT]]
+// CHECK-32BIT-LONG: [[LSHIFT:%[0-9]+]] = and i32 [[NEGATE]], 31
+// CHECK-32BIT-LONG: [[HIGH:%[0-9]+]] = shl i32 [[VALUE]], [[LSHIFT]]
+// CHECK-32BIT-LONG: [[RESULT:%[0-9]+]] = or i32 [[HIGH]], [[LOW]]
// CHECK-32BIT-LONG: ret i32 [[RESULT]]
// CHECK-32BIT-LONG }
@@ -158,12 +147,11 @@ unsigned __int64 test_rotr64(unsigned __int64 value, int shift) {
return _rotr64(value, shift);
}
// CHECK: i64 @test_rotr64
-// CHECK: [[SHIFT:%[0-9]+]] = and i64 %{{[0-9]+}}, 63
-// CHECK: [[NEGSHIFT:%[0-9]+]] = sub i64 64, [[SHIFT]]
-// CHECK: [[LOW:%[0-9]+]] = lshr i64 [[VALUE:%[0-9]+]], [[SHIFT]]
-// CHECK: [[HIGH:%[0-9]+]] = shl i64 [[VALUE]], [[NEGSHIFT]]
-// CHECK: [[ROTATED:%[0-9]+]] = or i64 [[HIGH]], [[LOW]]
-// CHECK: [[ISZERO:%[0-9]+]] = icmp eq i64 [[SHIFT]], 0
-// CHECK: [[RESULT:%[0-9]+]] = select i1 [[ISZERO]], i64 [[VALUE]], i64 [[ROTATED]]
+// CHECK: [[RSHIFT:%[0-9]+]] = and i64 [[SHIFT:%[0-9]+]], 63
+// CHECK: [[LOW:%[0-9]+]] = lshr i64 [[VALUE:%[0-9]+]], [[RSHIFT]]
+// CHECK: [[NEGATE:%[0-9]+]] = sub i64 0, [[SHIFT]]
+// CHECK: [[LSHIFT:%[0-9]+]] = and i64 [[NEGATE]], 63
+// CHECK: [[HIGH:%[0-9]+]] = shl i64 [[VALUE]], [[LSHIFT]]
+// CHECK: [[RESULT:%[0-9]+]] = or i64 [[HIGH]], [[LOW]]
// CHECK: ret i64 [[RESULT]]
// CHECK }
diff --git a/test/CodeGen/ms-intrinsics.c b/test/CodeGen/ms-intrinsics.c
index 38cda9785029..a8c234123a93 100644
--- a/test/CodeGen/ms-intrinsics.c
+++ b/test/CodeGen/ms-intrinsics.c
@@ -7,6 +7,9 @@
// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \
// RUN: -triple x86_64--windows -Oz -emit-llvm -target-feature +cx16 %s -o - \
// RUN: | FileCheck %s --check-prefixes CHECK,CHECK-X64,CHECK-ARM-X64,CHECK-INTEL
+// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \
+// RUN: -triple aarch64-windows -Oz -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix CHECK-ARM-X64
// intrin.h needs size_t, but -ffreestanding prevents us from getting it from
// stddef.h. Work around it with this typedef.
@@ -20,15 +23,98 @@ void test__stosb(unsigned char *Dest, unsigned char Data, size_t Count) {
}
// CHECK-I386: define{{.*}}void @test__stosb
-// CHECK-I386: tail call void @llvm.memset.p0i8.i32(i8* %Dest, i8 %Data, i32 %Count, i32 1, i1 true)
+// CHECK-I386: tail call void @llvm.memset.p0i8.i32(i8* align 1 %Dest, i8 %Data, i32 %Count, i1 true)
// CHECK-I386: ret void
// CHECK-I386: }
// CHECK-X64: define{{.*}}void @test__stosb
-// CHECK-X64: tail call void @llvm.memset.p0i8.i64(i8* %Dest, i8 %Data, i64 %Count, i32 1, i1 true)
+// CHECK-X64: tail call void @llvm.memset.p0i8.i64(i8* align 1 %Dest, i8 %Data, i64 %Count, i1 true)
+// CHECK-X64: ret void
+// CHECK-X64: }
+
+void test__movsb(unsigned char *Dest, unsigned char *Src, size_t Count) {
+ return __movsb(Dest, Src, Count);
+}
+// CHECK-I386-LABEL: define{{.*}} void @test__movsb
+// CHECK-I386: call { i8*, i8*, i32 } asm sideeffect "rep movsb", "={di},={si},={cx},0,1,2,~{memory},~{dirflag},~{fpsr},~{flags}"(i8* %Dest, i8* %Src, i32 %Count)
+// CHECK-I386: ret void
+// CHECK-I386: }
+
+// CHECK-X64-LABEL: define{{.*}} void @test__movsb
+// CHECK-X64: call { i8*, i8*, i64 } asm sideeffect "rep movsb", "={di},={si},={cx},0,1,2,~{memory},~{dirflag},~{fpsr},~{flags}"(i8* %Dest, i8* %Src, i64 %Count)
+// CHECK-X64: ret void
+// CHECK-X64: }
+
+void test__stosw(unsigned short *Dest, unsigned short Data, size_t Count) {
+ return __stosw(Dest, Data, Count);
+}
+// CHECK-I386-LABEL: define{{.*}} void @test__stosw
+// CHECK-I386: call { i16*, i32 } asm sideeffect "rep stosw", "={di},={cx},{ax},0,1,~{memory},~{dirflag},~{fpsr},~{flags}"(i16 %Data, i16* %Dest, i32 %Count)
+// CHECK-I386: ret void
+// CHECK-I386: }
+
+// CHECK-X64-LABEL: define{{.*}} void @test__stosw
+// CHECK-X64: call { i16*, i64 } asm sideeffect "rep stosw", "={di},={cx},{ax},0,1,~{memory},~{dirflag},~{fpsr},~{flags}"(i16 %Data, i16* %Dest, i64 %Count)
+// CHECK-X64: ret void
+// CHECK-X64: }
+
+void test__movsw(unsigned short *Dest, unsigned short *Src, size_t Count) {
+ return __movsw(Dest, Src, Count);
+}
+// CHECK-I386-LABEL: define{{.*}} void @test__movsw
+// CHECK-I386: call { i16*, i16*, i32 } asm sideeffect "rep movsw", "={di},={si},={cx},0,1,2,~{memory},~{dirflag},~{fpsr},~{flags}"(i16* %Dest, i16* %Src, i32 %Count)
+// CHECK-I386: ret void
+// CHECK-I386: }
+
+// CHECK-X64-LABEL: define{{.*}} void @test__movsw
+// CHECK-X64: call { i16*, i16*, i64 } asm sideeffect "rep movsw", "={di},={si},={cx},0,1,2,~{memory},~{dirflag},~{fpsr},~{flags}"(i16* %Dest, i16* %Src, i64 %Count)
+// CHECK-X64: ret void
+// CHECK-X64: }
+
+void test__stosd(unsigned long *Dest, unsigned long Data, size_t Count) {
+ return __stosd(Dest, Data, Count);
+}
+// CHECK-I386-LABEL: define{{.*}} void @test__stosd
+// CHECK-I386: call { i32*, i32 } asm sideeffect "rep stosl", "={di},={cx},{ax},0,1,~{memory},~{dirflag},~{fpsr},~{flags}"(i32 %Data, i32* %Dest, i32 %Count)
+// CHECK-I386: ret void
+// CHECK-I386: }
+
+// CHECK-X64-LABEL: define{{.*}} void @test__stosd
+// CHECK-X64: call { i32*, i64 } asm sideeffect "rep stosl", "={di},={cx},{ax},0,1,~{memory},~{dirflag},~{fpsr},~{flags}"(i32 %Data, i32* %Dest, i64 %Count)
+// CHECK-X64: ret void
+// CHECK-X64: }
+
+void test__movsd(unsigned long *Dest, unsigned long *Src, size_t Count) {
+ return __movsd(Dest, Src, Count);
+}
+// CHECK-I386-LABEL: define{{.*}} void @test__movsd
+// CHECK-I386: call { i32*, i32*, i32 } asm sideeffect "rep movsl", "={di},={si},={cx},0,1,2,~{memory},~{dirflag},~{fpsr},~{flags}"(i32* %Dest, i32* %Src, i32 %Count)
+// CHECK-I386: ret void
+// CHECK-I386: }
+
+// CHECK-X64-LABEL: define{{.*}} void @test__movsd
+// CHECK-X64: call { i32*, i32*, i64 } asm sideeffect "rep movsl", "={di},={si},={cx},0,1,2,~{memory},~{dirflag},~{fpsr},~{flags}"(i32* %Dest, i32* %Src, i64 %Count)
+// CHECK-X64: ret void
+// CHECK-X64: }
+
+#ifdef __x86_64__
+void test__stosq(unsigned __int64 *Dest, unsigned __int64 Data, size_t Count) {
+ return __stosq(Dest, Data, Count);
+}
+// CHECK-X64-LABEL: define{{.*}} void @test__stosq
+// CHECK-X64: call { i64*, i64 } asm sideeffect "rep stosq", "={di},={cx},{ax},0,1,~{memory},~{dirflag},~{fpsr},~{flags}"(i64 %Data, i64* %Dest, i64 %Count)
// CHECK-X64: ret void
// CHECK-X64: }
+void test__movsq(unsigned __int64 *Dest, unsigned __int64 *Src, size_t Count) {
+ return __movsq(Dest, Src, Count);
+}
+// CHECK-X64-LABEL: define{{.*}} void @test__movsq
+// CHECK-X64: call { i64*, i64*, i64 } asm sideeffect "rep movsq", "={di},={si},={cx},0,1,2,~{memory},~{dirflag},~{fpsr},~{flags}"(i64* %Dest, i64* %Src, i64 %Count)
+// CHECK-X64: ret void
+// CHECK-X64: }
+#endif
+
void test__ud2(void) {
__ud2();
}
@@ -55,7 +141,7 @@ void *test_ReturnAddress() {
void *test_AddressOfReturnAddress() {
return _AddressOfReturnAddress();
}
-// CHECK-INTEL-LABEL: define i8* @test_AddressOfReturnAddress()
+// CHECK-INTEL-LABEL: define dso_local i8* @test_AddressOfReturnAddress()
// CHECK-INTEL: = tail call i8* @llvm.addressofreturnaddress()
// CHECK-INTEL: ret i8*
#endif
@@ -89,7 +175,7 @@ unsigned char test_BitScanReverse(unsigned long *Index, unsigned long Mask) {
// CHECK: store i32 [[INDEX]], i32* %Index, align 4
// CHECK: br label %[[END_LABEL]]
-#if defined(__x86_64__) || defined(__arm__)
+#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
unsigned char test_BitScanForward64(unsigned long *Index, unsigned __int64 Mask) {
return _BitScanForward64(Index, Mask);
}
@@ -386,7 +472,7 @@ long test_InterlockedDecrement(long volatile *Addend) {
// CHECK: ret i32 [[RESULT]]
// CHECK: }
-#if defined(__x86_64__) || defined(__arm__)
+#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
__int64 test_InterlockedExchange64(__int64 volatile *value, __int64 mask) {
return _InterlockedExchange64(value, mask);
}
@@ -455,17 +541,56 @@ __int64 test_InterlockedDecrement64(__int64 volatile *Addend) {
#endif
-unsigned char test_interlockedbittestandset(volatile long *ptr, long bit) {
- return _interlockedbittestandset(ptr, bit);
+#if defined(__i386__) || defined(__x86_64__)
+long test_InterlockedExchange_HLEAcquire(long volatile *Target, long Value) {
+// CHECK-INTEL: define{{.*}} i32 @test_InterlockedExchange_HLEAcquire(i32*{{[a-z_ ]*}}%Target, i32{{[a-z_ ]*}}%Value)
+// CHECK-INTEL: call i32 asm sideeffect ".byte 0xf2 ; lock ; xchg $0, $1", "=r,=*m,0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(i32* %Target, i32 %Value, i32* %Target)
+ return _InterlockedExchange_HLEAcquire(Target, Value);
+}
+long test_InterlockedExchange_HLERelease(long volatile *Target, long Value) {
+// CHECK-INTEL: define{{.*}} i32 @test_InterlockedExchange_HLERelease(i32*{{[a-z_ ]*}}%Target, i32{{[a-z_ ]*}}%Value)
+// CHECK-INTEL: call i32 asm sideeffect ".byte 0xf3 ; lock ; xchg $0, $1", "=r,=*m,0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(i32* %Target, i32 %Value, i32* %Target)
+ return _InterlockedExchange_HLERelease(Target, Value);
+}
+long test_InterlockedCompareExchange_HLEAcquire(long volatile *Destination,
+ long Exchange, long Comparand) {
+// CHECK-INTEL: define{{.*}} i32 @test_InterlockedCompareExchange_HLEAcquire(i32*{{[a-z_ ]*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comparand)
+// CHECK-INTEL: call i32 asm sideeffect ".byte 0xf2 ; lock ; cmpxchg $2, $1", "={ax},=*m,r,0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(i32* %Destination, i32 %Exchange, i32 %Comparand, i32* %Destination)
+ return _InterlockedCompareExchange_HLEAcquire(Destination, Exchange, Comparand);
+}
+long test_InterlockedCompareExchange_HLERelease(long volatile *Destination,
+ long Exchange, long Comparand) {
+// CHECK-INTEL: define{{.*}} i32 @test_InterlockedCompareExchange_HLERelease(i32*{{[a-z_ ]*}}%Destination, i32{{[a-z_ ]*}}%Exchange, i32{{[a-z_ ]*}}%Comparand)
+// CHECK-INTEL: call i32 asm sideeffect ".byte 0xf3 ; lock ; cmpxchg $2, $1", "={ax},=*m,r,0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(i32* %Destination, i32 %Exchange, i32 %Comparand, i32* %Destination)
+ return _InterlockedCompareExchange_HLERelease(Destination, Exchange, Comparand);
}
-// CHECK-LABEL: define{{.*}} i8 @test_interlockedbittestandset
-// CHECK: [[MASKBIT:%[0-9]+]] = shl i32 1, %bit
-// CHECK: [[OLD:%[0-9]+]] = atomicrmw or i32* %ptr, i32 [[MASKBIT]] seq_cst
-// CHECK: [[SHIFT:%[0-9]+]] = lshr i32 [[OLD]], %bit
-// CHECK: [[TRUNC:%[0-9]+]] = trunc i32 [[SHIFT]] to i8
-// CHECK: [[AND:%[0-9]+]] = and i8 [[TRUNC]], 1
-// CHECK: ret i8 [[AND]]
+#endif
+#if defined(__x86_64__)
+__int64 test_InterlockedExchange64_HLEAcquire(__int64 volatile *Target, __int64 Value) {
+// CHECK-X64: define{{.*}} i64 @test_InterlockedExchange64_HLEAcquire(i64*{{[a-z_ ]*}}%Target, i64{{[a-z_ ]*}}%Value)
+// CHECK-X64: call i64 asm sideeffect ".byte 0xf2 ; lock ; xchg $0, $1", "=r,=*m,0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(i64* %Target, i64 %Value, i64* %Target)
+ return _InterlockedExchange64_HLEAcquire(Target, Value);
+}
+__int64 test_InterlockedExchange64_HLERelease(__int64 volatile *Target, __int64 Value) {
+// CHECK-X64: define{{.*}} i64 @test_InterlockedExchange64_HLERelease(i64*{{[a-z_ ]*}}%Target, i64{{[a-z_ ]*}}%Value)
+// CHECK-X64: call i64 asm sideeffect ".byte 0xf3 ; lock ; xchg $0, $1", "=r,=*m,0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(i64* %Target, i64 %Value, i64* %Target)
+ return _InterlockedExchange64_HLERelease(Target, Value);
+}
+__int64 test_InterlockedCompareExchange64_HLEAcquire(__int64 volatile *Destination,
+ __int64 Exchange, __int64 Comparand) {
+// CHECK-X64: define{{.*}} i64 @test_InterlockedCompareExchange64_HLEAcquire(i64*{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%Exchange, i64{{[a-z_ ]*}}%Comparand)
+// CHECK-X64: call i64 asm sideeffect ".byte 0xf2 ; lock ; cmpxchg $2, $1", "={ax},=*m,r,0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(i64* %Destination, i64 %Exchange, i64 %Comparand, i64* %Destination)
+ return _InterlockedCompareExchange64_HLEAcquire(Destination, Exchange, Comparand);
+}
+__int64 test_InterlockedCompareExchange64_HLERelease(__int64 volatile *Destination,
+ __int64 Exchange, __int64 Comparand) {
+// CHECK-X64: define{{.*}} i64 @test_InterlockedCompareExchange64_HLERelease(i64*{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%Exchange, i64{{[a-z_ ]*}}%Comparand)
+// CHECK-X64: call i64 asm sideeffect ".byte 0xf3 ; lock ; cmpxchg $2, $1", "={ax},=*m,r,0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(i64* %Destination, i64 %Exchange, i64 %Comparand, i64* %Destination)
+ return _InterlockedCompareExchange64_HLERelease(Destination, Exchange, Comparand);
+}
+#endif
+#if !defined(__aarch64__)
void test__fastfail() {
__fastfail(42);
}
@@ -476,4 +601,4 @@ void test__fastfail() {
// Attributes come last.
// CHECK: attributes #[[NORETURN]] = { noreturn{{.*}} }
-
+#endif
diff --git a/test/CodeGen/ms-setjmp.c b/test/CodeGen/ms-setjmp.c
index 675c8dda27de..d92f4d00ff5c 100644
--- a/test/CodeGen/ms-setjmp.c
+++ b/test/CodeGen/ms-setjmp.c
@@ -1,7 +1,9 @@
// RUN: %clang_cc1 -fms-extensions -DDECLARE_SETJMP -triple i686-windows-msvc -emit-llvm %s -o - | FileCheck --check-prefix=I386 %s
// RUN: %clang_cc1 -fms-extensions -DDECLARE_SETJMP -triple x86_64-windows-msvc -emit-llvm %s -o - | FileCheck --check-prefix=X64 %s
+// RUN: %clang_cc1 -fms-extensions -DDECLARE_SETJMP -triple aarch64-windows-msvc -emit-llvm %s -o - | FileCheck --check-prefix=AARCH64 %s
// RUN: %clang_cc1 -fms-extensions -triple i686-windows-msvc -emit-llvm %s -o - | FileCheck --check-prefix=I386 %s
// RUN: %clang_cc1 -fms-extensions -triple x86_64-windows-msvc -emit-llvm %s -o - | FileCheck --check-prefix=X64 %s
+// RUN: %clang_cc1 -fms-extensions -triple aarch64-windows-msvc -emit-llvm %s -o - | FileCheck --check-prefix=AARCH64 %s
typedef char jmp_buf[1];
#ifdef DECLARE_SETJMP
@@ -13,20 +15,30 @@ jmp_buf jb;
int test_setjmp() {
return _setjmp(jb);
- // I386-LABEL: define i32 @test_setjmp
+ // I386-LABEL: define dso_local i32 @test_setjmp
// I386: %[[call:.*]] = call i32 (i8*, i32, ...) @_setjmp3(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @jb, i32 0, i32 0), i32 0)
// I386-NEXT: ret i32 %[[call]]
- // X64-LABEL: define i32 @test_setjmp
+ // X64-LABEL: define dso_local i32 @test_setjmp
// X64: %[[addr:.*]] = call i8* @llvm.frameaddress(i32 0)
// X64: %[[call:.*]] = call i32 @_setjmp(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @jb, i32 0, i32 0), i8* %[[addr]])
// X64-NEXT: ret i32 %[[call]]
+
+ // AARCH64-LABEL: define dso_local i32 @test_setjmp
+ // AARCH64: %[[addr:.*]] = call i8* @llvm.frameaddress(i32 0)
+ // AARCH64: %[[call:.*]] = call i32 @_setjmpex(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @jb, i32 0, i32 0), i8* %[[addr]])
+ // AARCH64-NEXT: ret i32 %[[call]]
}
int test_setjmpex() {
return _setjmpex(jb);
- // X64-LABEL: define i32 @test_setjmpex
+ // X64-LABEL: define dso_local i32 @test_setjmpex
// X64: %[[addr:.*]] = call i8* @llvm.frameaddress(i32 0)
// X64: %[[call:.*]] = call i32 @_setjmpex(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @jb, i32 0, i32 0), i8* %[[addr]])
// X64-NEXT: ret i32 %[[call]]
+
+ // AARCH64-LABEL: define dso_local i32 @test_setjmpex
+ // AARCH64: %[[addr:.*]] = call i8* @llvm.frameaddress(i32 0)
+ // AARCH64: %[[call:.*]] = call i32 @_setjmpex(i8* getelementptr inbounds ([1 x i8], [1 x i8]* @jb, i32 0, i32 0), i8* %[[addr]])
+ // AARCH64-NEXT: ret i32 %[[call]]
}
diff --git a/test/CodeGen/ms-volatile-aarch64.c b/test/CodeGen/ms-volatile-aarch64.c
new file mode 100644
index 000000000000..2a139f5139fa
--- /dev/null
+++ b/test/CodeGen/ms-volatile-aarch64.c
@@ -0,0 +1,13 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-win32 -emit-llvm -fms-extensions -fms-volatile -o - < %s | FileCheck %s
+
+void test1(int volatile *p, int v) {
+ __iso_volatile_store32(p, v);
+ // CHECK-LABEL: @test1
+ // CHECK: store volatile {{.*}}, {{.*}}
+}
+int test2(const int volatile *p) {
+ return __iso_volatile_load32(p);
+ // CHECK-LABEL: @test2
+ // CHECK: load volatile {{.*}}
+}
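
The new ms-volatile-aarch64.c test carries the existing -fms-volatile coverage over to AArch64: __iso_volatile_load32 and __iso_volatile_store32 must stay plain volatile loads and stores even under MS volatile semantics, which would otherwise give ordinary volatile accesses acquire/release ordering on ARM targets. A hedged usage sketch, assuming a memory-mapped status register; STATUS_REG is a made-up address, not part of the test:

#define STATUS_REG ((volatile int *)0x40001000)  /* hypothetical MMIO address */

int wait_until_ready(void) {
  /* ISO-volatile load: no implicit barrier, just a 'load volatile'. */
  while ((__iso_volatile_load32(STATUS_REG) & 1) == 0)
    ;                                            /* poll bit 0 */
  __iso_volatile_store32(STATUS_REG, 0);         /* plain 'store volatile' ack */
  return 1;
}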
diff --git a/test/CodeGen/ms-x86-intrinsics.c b/test/CodeGen/ms-x86-intrinsics.c
index 51520d1f658b..450a134131be 100644
--- a/test/CodeGen/ms-x86-intrinsics.c
+++ b/test/CodeGen/ms-x86-intrinsics.c
@@ -9,7 +9,7 @@
char test__readfsbyte(unsigned long Offset) {
return __readfsbyte(Offset);
}
-// CHECK-I386-LABEL: define signext i8 @test__readfsbyte(i32 %Offset)
+// CHECK-I386-LABEL: define dso_local signext i8 @test__readfsbyte(i32 %Offset)
// CHECK-I386: [[PTR:%[0-9]+]] = inttoptr i32 %Offset to i8 addrspace(257)*
// CHECK-I386: [[VALUE:%[0-9]+]] = load volatile i8, i8 addrspace(257)* [[PTR]], align 1
// CHECK-I386: ret i8 [[VALUE:%[0-9]+]]
@@ -17,7 +17,7 @@ char test__readfsbyte(unsigned long Offset) {
short test__readfsword(unsigned long Offset) {
return __readfsword(Offset);
}
-// CHECK-I386-LABEL: define signext i16 @test__readfsword(i32 %Offset)
+// CHECK-I386-LABEL: define dso_local signext i16 @test__readfsword(i32 %Offset)
// CHECK-I386: [[PTR:%[0-9]+]] = inttoptr i32 %Offset to i16 addrspace(257)*
// CHECK-I386: [[VALUE:%[0-9]+]] = load volatile i16, i16 addrspace(257)* [[PTR]], align 2
// CHECK-I386: ret i16 [[VALUE:%[0-9]+]]
@@ -25,7 +25,7 @@ short test__readfsword(unsigned long Offset) {
long test__readfsdword(unsigned long Offset) {
return __readfsdword(Offset);
}
-// CHECK-I386-LABEL: define i32 @test__readfsdword(i32 %Offset)
+// CHECK-I386-LABEL: define dso_local i32 @test__readfsdword(i32 %Offset)
// CHECK-I386: [[PTR:%[0-9]+]] = inttoptr i32 %Offset to i32 addrspace(257)*
// CHECK-I386: [[VALUE:%[0-9]+]] = load volatile i32, i32 addrspace(257)* [[PTR]], align 4
// CHECK-I386: ret i32 [[VALUE:%[0-9]+]]
@@ -33,7 +33,7 @@ long test__readfsdword(unsigned long Offset) {
long long test__readfsqword(unsigned long Offset) {
return __readfsqword(Offset);
}
-// CHECK-I386-LABEL: define i64 @test__readfsqword(i32 %Offset)
+// CHECK-I386-LABEL: define dso_local i64 @test__readfsqword(i32 %Offset)
// CHECK-I386: [[PTR:%[0-9]+]] = inttoptr i32 %Offset to i64 addrspace(257)*
// CHECK-I386: [[VALUE:%[0-9]+]] = load volatile i64, i64 addrspace(257)* [[PTR]], align 8
// CHECK-I386: ret i64 [[VALUE:%[0-9]+]]
@@ -42,7 +42,7 @@ long long test__readfsqword(unsigned long Offset) {
__int64 test__emul(int a, int b) {
return __emul(a, b);
}
-// CHECK-LABEL: define i64 @test__emul(i32 %a, i32 %b)
+// CHECK-LABEL: define dso_local i64 @test__emul(i32 %a, i32 %b)
// CHECK: [[X:%[0-9]+]] = sext i32 %a to i64
// CHECK: [[Y:%[0-9]+]] = sext i32 %b to i64
// CHECK: [[RES:%[0-9]+]] = mul nsw i64 [[Y]], [[X]]
@@ -51,7 +51,7 @@ __int64 test__emul(int a, int b) {
unsigned __int64 test__emulu(unsigned int a, unsigned int b) {
return __emulu(a, b);
}
-// CHECK-LABEL: define i64 @test__emulu(i32 %a, i32 %b)
+// CHECK-LABEL: define dso_local i64 @test__emulu(i32 %a, i32 %b)
// CHECK: [[X:%[0-9]+]] = zext i32 %a to i64
// CHECK: [[Y:%[0-9]+]] = zext i32 %b to i64
// CHECK: [[RES:%[0-9]+]] = mul nuw i64 [[Y]], [[X]]
@@ -62,7 +62,7 @@ unsigned __int64 test__emulu(unsigned int a, unsigned int b) {
char test__readgsbyte(unsigned long Offset) {
return __readgsbyte(Offset);
}
-// CHECK-X64-LABEL: define i8 @test__readgsbyte(i32 %Offset)
+// CHECK-X64-LABEL: define dso_local i8 @test__readgsbyte(i32 %Offset)
// CHECK-X64: [[ZEXT:%[0-9]+]] = zext i32 %Offset to i64
// CHECK-X64: [[PTR:%[0-9]+]] = inttoptr i64 [[ZEXT]] to i8 addrspace(256)*
// CHECK-X64: [[VALUE:%[0-9]+]] = load volatile i8, i8 addrspace(256)* [[PTR]], align 1
@@ -71,7 +71,7 @@ char test__readgsbyte(unsigned long Offset) {
short test__readgsword(unsigned long Offset) {
return __readgsword(Offset);
}
-// CHECK-X64-LABEL: define i16 @test__readgsword(i32 %Offset)
+// CHECK-X64-LABEL: define dso_local i16 @test__readgsword(i32 %Offset)
// CHECK-X64: [[ZEXT:%[0-9]+]] = zext i32 %Offset to i64
// CHECK-X64: [[PTR:%[0-9]+]] = inttoptr i64 [[ZEXT]] to i16 addrspace(256)*
// CHECK-X64: [[VALUE:%[0-9]+]] = load volatile i16, i16 addrspace(256)* [[PTR]], align 2
@@ -80,7 +80,7 @@ short test__readgsword(unsigned long Offset) {
long test__readgsdword(unsigned long Offset) {
return __readgsdword(Offset);
}
-// CHECK-X64-LABEL: define i32 @test__readgsdword(i32 %Offset)
+// CHECK-X64-LABEL: define dso_local i32 @test__readgsdword(i32 %Offset)
// CHECK-X64: [[ZEXT:%[0-9]+]] = zext i32 %Offset to i64
// CHECK-X64: [[PTR:%[0-9]+]] = inttoptr i64 [[ZEXT]] to i32 addrspace(256)*
// CHECK-X64: [[VALUE:%[0-9]+]] = load volatile i32, i32 addrspace(256)* [[PTR]], align 4
@@ -89,7 +89,7 @@ long test__readgsdword(unsigned long Offset) {
long long test__readgsqword(unsigned long Offset) {
return __readgsqword(Offset);
}
-// CHECK-X64-LABEL: define i64 @test__readgsqword(i32 %Offset)
+// CHECK-X64-LABEL: define dso_local i64 @test__readgsqword(i32 %Offset)
// CHECK-X64: [[ZEXT:%[0-9]+]] = zext i32 %Offset to i64
// CHECK-X64: [[PTR:%[0-9]+]] = inttoptr i64 [[ZEXT]] to i64 addrspace(256)*
// CHECK-X64: [[VALUE:%[0-9]+]] = load volatile i64, i64 addrspace(256)* [[PTR]], align 8
@@ -98,13 +98,13 @@ long long test__readgsqword(unsigned long Offset) {
__int64 test__mulh(__int64 a, __int64 b) {
return __mulh(a, b);
}
-// CHECK-X64-LABEL: define i64 @test__mulh(i64 %a, i64 %b)
+// CHECK-X64-LABEL: define dso_local i64 @test__mulh(i64 %a, i64 %b)
// CHECK-X64: = mul nsw i128 %
unsigned __int64 test__umulh(unsigned __int64 a, unsigned __int64 b) {
return __umulh(a, b);
}
-// CHECK-X64-LABEL: define i64 @test__umulh(i64 %a, i64 %b)
+// CHECK-X64-LABEL: define dso_local i64 @test__umulh(i64 %a, i64 %b)
// CHECK-X64: = mul nuw i128 %
__int64 test_mul128(__int64 Multiplier,
@@ -112,7 +112,7 @@ __int64 test_mul128(__int64 Multiplier,
__int64 *HighProduct) {
return _mul128(Multiplier, Multiplicand, HighProduct);
}
-// CHECK-X64-LABEL: define i64 @test_mul128(i64 %Multiplier, i64 %Multiplicand, i64*{{[a-z_ ]*}}%HighProduct)
+// CHECK-X64-LABEL: define dso_local i64 @test_mul128(i64 %Multiplier, i64 %Multiplicand, i64*{{[a-z_ ]*}}%HighProduct)
// CHECK-X64: = sext i64 %Multiplier to i128
// CHECK-X64: = sext i64 %Multiplicand to i128
// CHECK-X64: = mul nsw i128 %
@@ -124,7 +124,7 @@ unsigned __int64 test_umul128(unsigned __int64 Multiplier,
unsigned __int64 *HighProduct) {
return _umul128(Multiplier, Multiplicand, HighProduct);
}
-// CHECK-X64-LABEL: define i64 @test_umul128(i64 %Multiplier, i64 %Multiplicand, i64*{{[a-z_ ]*}}%HighProduct)
+// CHECK-X64-LABEL: define dso_local i64 @test_umul128(i64 %Multiplier, i64 %Multiplicand, i64*{{[a-z_ ]*}}%HighProduct)
// CHECK-X64: = zext i64 %Multiplier to i128
// CHECK-X64: = zext i64 %Multiplicand to i128
// CHECK-X64: = mul nuw i128 %
diff --git a/test/CodeGen/ms_abi.c b/test/CodeGen/ms_abi.c
index 407087e40916..75e1caf922df 100644
--- a/test/CodeGen/ms_abi.c
+++ b/test/CodeGen/ms_abi.c
@@ -13,7 +13,7 @@ void __attribute__((ms_abi)) f1(void);
void __attribute__((sysv_abi)) f2(void);
void f3(void) {
// FREEBSD-LABEL: define void @f3()
- // WIN64-LABEL: define void @f3()
+ // WIN64-LABEL: define dso_local void @f3()
f1();
// FREEBSD: call win64cc void @f1()
// WIN64: call void @f1()
@@ -23,13 +23,13 @@ void f3(void) {
}
// FREEBSD: declare win64cc void @f1()
// FREEBSD: declare void @f2()
-// WIN64: declare void @f1()
-// WIN64: declare x86_64_sysvcc void @f2()
+// WIN64: declare dso_local void @f1()
+// WIN64: declare dso_local x86_64_sysvcc void @f2()
// Win64 ABI varargs
void __attribute__((ms_abi)) f4(int a, ...) {
// FREEBSD-LABEL: define win64cc void @f4
- // WIN64-LABEL: define void @f4
+ // WIN64-LABEL: define dso_local void @f4
__builtin_ms_va_list ap;
__builtin_ms_va_start(ap, a);
// FREEBSD: %[[AP:.*]] = alloca i8*
@@ -79,7 +79,7 @@ void __attribute__((ms_abi)) f4(int a, ...) {
// Let's verify that normal va_lists work right on Win64, too.
void f5(int a, ...) {
- // WIN64-LABEL: define void @f5
+ // WIN64-LABEL: define dso_local void @f5
__builtin_va_list ap;
__builtin_va_start(ap, a);
// WIN64: %[[AP:.*]] = alloca i8*
@@ -110,7 +110,7 @@ void f5(int a, ...) {
void __attribute__((sysv_abi)) f6(__builtin_ms_va_list ap) {
// FREEBSD-LABEL: define void @f6
// FREEBSD: store i8* %ap, i8** %[[AP:.*]]
- // WIN64-LABEL: define x86_64_sysvcc void @f6
+ // WIN64-LABEL: define dso_local x86_64_sysvcc void @f6
// WIN64: store i8* %ap, i8** %[[AP:.*]]
int b = __builtin_va_arg(ap, int);
// FREEBSD: %[[AP_CUR:.*]] = load i8*, i8** %[[AP]]
@@ -146,3 +146,16 @@ void __attribute__((sysv_abi)) f6(__builtin_ms_va_list ap) {
// WIN64: %[[AP_VAL:.*]] = load i8*, i8** %[[AP]]
// WIN64-NEXT: store i8* %[[AP_VAL]], i8** %[[AP2:.*]]
}
+
+// This test checks that structs are passed according to the Win64 calling
+// convention when it is enforced by __attribute__((ms_abi)).
+struct i128 {
+ unsigned long long a;
+ unsigned long long b;
+};
+
+__attribute__((ms_abi)) struct i128 f7(struct i128 a) {
+ // WIN64: define dso_local void @f7(%struct.i128* noalias sret %agg.result, %struct.i128* %a)
+ // FREEBSD: define win64cc void @f7(%struct.i128* noalias sret %agg.result, %struct.i128* %a)
+ return a;
+}
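
As the new f7 checks show, a 16-byte struct returned from an ms_abi function is always returned indirectly through an sret pointer; on a non-Windows target the only visible difference is the win64cc convention on the definition and its calls. A hedged caller sketch, assuming it lives in the same translation unit as f7 above (make_pair is illustrative, not part of the test):

struct i128 make_pair(unsigned long long lo, unsigned long long hi) {
  struct i128 v = { lo, hi };
  return f7(v);  /* on FreeBSD/Linux this roughly lowers to 'call win64cc ... sret' */
}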
diff --git a/test/CodeGen/ms_abi_aarch64.c b/test/CodeGen/ms_abi_aarch64.c
index 4aa93f00859a..5b6cd6a208b6 100644
--- a/test/CodeGen/ms_abi_aarch64.c
+++ b/test/CodeGen/ms_abi_aarch64.c
@@ -5,7 +5,7 @@ void __attribute__((ms_abi)) f1(void);
void f2(void);
void f3(void) {
// LINUX-LABEL: define void @f3()
- // WIN64-LABEL: define void @f3()
+ // WIN64-LABEL: define dso_local void @f3()
f1();
// LINUX: call win64cc void @f1()
// WIN64: call void @f1()
@@ -15,13 +15,13 @@ void f3(void) {
}
// LINUX: declare win64cc void @f1()
// LINUX: declare void @f2()
-// WIN64: declare void @f1()
-// WIN64: declare void @f2()
+// WIN64: declare dso_local void @f1()
+// WIN64: declare dso_local void @f2()
// Win64 ABI varargs
void __attribute__((ms_abi)) f4(int a, ...) {
// LINUX-LABEL: define win64cc void @f4
- // WIN64-LABEL: define void @f4
+ // WIN64-LABEL: define dso_local void @f4
__builtin_ms_va_list ap;
__builtin_ms_va_start(ap, a);
// LINUX: %[[AP:.*]] = alloca i8*
@@ -50,7 +50,7 @@ void __attribute__((ms_abi)) f4(int a, ...) {
// Let's verify that normal va_lists work right on Win64, too.
void f5(int a, ...) {
- // WIN64-LABEL: define void @f5
+ // WIN64-LABEL: define dso_local void @f5
__builtin_va_list ap;
__builtin_va_start(ap, a);
// WIN64: %[[AP:.*]] = alloca i8*
diff --git a/test/CodeGen/ms_struct-long-double.c b/test/CodeGen/ms_struct-long-double.c
new file mode 100644
index 000000000000..9b3ea7947a03
--- /dev/null
+++ b/test/CodeGen/ms_struct-long-double.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -emit-llvm-only -triple i686-windows-gnu -fdump-record-layouts %s | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm-only -triple i686-linux -fdump-record-layouts -Wno-incompatible-ms-struct %s | FileCheck %s
+// RUN: not %clang_cc1 -emit-llvm-only -triple i686-linux -fdump-record-layouts %s 2>&1 | FileCheck %s -check-prefix=ERROR
+
+struct ldb_struct {
+ char c;
+ long double ldb;
+} __attribute__((__ms_struct__));
+
+struct ldb_struct a;
+
+// CHECK: 0 | struct ldb_struct
+// CHECK-NEXT: 0 | char c
+// CHECK-NEXT: 4 | long double ldb
+// CHECK-NEXT: | [sizeof=16, align=4]
+
+// ERROR: error: ms_struct may not produce Microsoft-compatible layouts with fundamental data types with sizes that aren't a power of two
diff --git a/test/CodeGen/ms_this.cpp b/test/CodeGen/ms_this.cpp
index 8647a5bc8b8b..f0da36f447ec 100644
--- a/test/CodeGen/ms_this.cpp
+++ b/test/CodeGen/ms_this.cpp
@@ -13,7 +13,7 @@ public:
void runc();
};
-// CHECK: define void @"\01?runc@t2@@
+// CHECK: define dso_local void @"?runc@t2@@
void t2::runc() {
double num = 0;
__asm {
@@ -26,7 +26,7 @@ void t2::runc() {
};
}
-// CHECK: define void @"\01?runc@t1@@
+// CHECK: define dso_local void @"?runc@t1@@
void t1::runc() {
double num = 0;
__asm {
@@ -41,7 +41,7 @@ void t1::runc() {
struct s {
int a;
- // CHECK: define linkonce_odr void @"\01?func@s@@
+ // CHECK: define linkonce_odr dso_local void @"?func@s@@
void func() {
__asm mov rax, [this]
// CHECK: [[THIS_ADDR_S:%.+]] = alloca %struct.s*
diff --git a/test/CodeGen/no-bitfield-type-align.c b/test/CodeGen/no-bitfield-type-align.c
new file mode 100644
index 000000000000..830cd585c6ba
--- /dev/null
+++ b/test/CodeGen/no-bitfield-type-align.c
@@ -0,0 +1,44 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -fno-bitfield-type-align -emit-llvm -o - %s | FileCheck %s
+
+// CHECK: %[[STRUCT_S:.*]] = type { i32 }
+
+struct S {
+ unsigned short: 0;
+ unsigned short f1:15;
+ unsigned short: 0;
+ unsigned short f2:15;
+};
+
+// CHECK: define void @test_zero_width_bitfield(%[[STRUCT_S]]* %[[A:.*]])
+// CHECK: %[[BF_LOAD:.*]] = load i32, i32* %[[V1:.*]], align 1
+// CHECK: %[[BF_CLEAR:.*]] = and i32 %[[BF_LOAD]], 32767
+// CHECK: %[[BF_CAST:.*]] = trunc i32 %[[BF_CLEAR]] to i16
+// CHECK: %[[CONV:.*]] = zext i16 %[[BF_CAST]] to i32
+// CHECK: %[[ADD:.*]] = add nsw i32 %[[CONV]], 1
+// CHECK: %[[CONV1:.*]] = trunc i32 %[[ADD]] to i16
+// CHECK: %[[V2:.*]] = zext i16 %[[CONV1]] to i32
+// CHECK: %[[BF_LOAD2:.*]] = load i32, i32* %[[V1]], align 1
+// CHECK: %[[BF_VALUE:.*]] = and i32 %[[V2]], 32767
+// CHECK: %[[BF_CLEAR3:.*]] = and i32 %[[BF_LOAD2]], -32768
+// CHECK: %[[BF_SET:.*]] = or i32 %[[BF_CLEAR3]], %[[BF_VALUE]]
+// CHECK: store i32 %[[BF_SET]], i32* %[[V1]], align 1
+
+// CHECK: %[[BF_LOAD4:.*]] = load i32, i32* %[[V4:.*]], align 1
+// CHECK: %[[BF_LSHR:.*]] = lshr i32 %[[BF_LOAD4]], 15
+// CHECK: %[[BF_CLEAR5:.*]] = and i32 %[[BF_LSHR]], 32767
+// CHECK: %[[BF_CAST6:.*]] = trunc i32 %[[BF_CLEAR5]] to i16
+// CHECK: %[[CONV7:.*]] = zext i16 %[[BF_CAST6]] to i32
+// CHECK: %[[ADD8:.*]] = add nsw i32 %[[CONV7]], 2
+// CHECK: %[[CONV9:.*]] = trunc i32 %[[ADD8]] to i16
+// CHECK: %[[V5:.*]] = zext i16 %[[CONV9]] to i32
+// CHECK: %[[BF_LOAD10:.*]] = load i32, i32* %[[V4]], align 1
+// CHECK: %[[BF_VALUE11:.*]] = and i32 %[[V5]], 32767
+// CHECK: %[[BF_SHL:.*]] = shl i32 %[[BF_VALUE11]], 15
+// CHECK: %[[BF_CLEAR12:.*]] = and i32 %[[BF_LOAD10]], -1073709057
+// CHECK: %[[BF_SET13:.*]] = or i32 %[[BF_CLEAR12]], %[[BF_SHL]]
+// CHECK: store i32 %[[BF_SET13]], i32* %[[V4]], align 1
+
+void test_zero_width_bitfield(struct S *a) {
+ a->f1 += 1;
+ a->f2 += 2;
+}
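
The checks above pin down the layout -fno-bitfield-type-align produces: the zero-width bit-fields no longer force new storage units, so both 15-bit fields share a single 32-bit unit (%struct.S = type { i32 }), f1 at bits 0-14 and f2 at bits 15-29 (hence the lshr by 15). A sketch probing that layout directly, assuming the little-endian x86_64 target from the RUN line; read_f2_raw is illustrative and not part of the test:

#include <stdint.h>
#include <string.h>

static unsigned read_f2_raw(const struct S *s) {
  uint32_t raw;
  memcpy(&raw, s, sizeof(raw));  /* struct S occupies exactly one 32-bit unit */
  return (raw >> 15) & 0x7fffu;  /* f2 lives at bits 15..29, matching the checks */
}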
diff --git a/test/CodeGen/no-common.c b/test/CodeGen/no-common.c
index 8d2c4d7d74c8..f3a6a334e6f5 100644
--- a/test/CodeGen/no-common.c
+++ b/test/CodeGen/no-common.c
@@ -1,15 +1,15 @@
// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s -check-prefix=CHECK-DEFAULT
// RUN: %clang_cc1 %s -fno-common -emit-llvm -o - | FileCheck %s -check-prefix=CHECK-NOCOMMON
-// CHECK-DEFAULT: @x = common global
-// CHECK-NOCOMMON: @x = global
+// CHECK-DEFAULT: @x = common {{(dso_local )?}}global
+// CHECK-NOCOMMON: @x = {{(dso_local )?}}global
int x;
-// CHECK-DEFAULT: @ABC = global
-// CHECK-NOCOMMON: @ABC = global
+// CHECK-DEFAULT: @ABC = {{(dso_local )?}}global
+// CHECK-NOCOMMON: @ABC = {{(dso_local )?}}global
typedef void* (*fn_t)(long a, long b, char *f, int c);
fn_t ABC __attribute__ ((nocommon));
-// CHECK-DEFAULT: @y = common global
-// CHECK-NOCOMMON: @y = common global
+// CHECK-DEFAULT: @y = common {{(dso_local )?}}global
+// CHECK-NOCOMMON: @y = common {{(dso_local )?}}global
int y __attribute__((common));
diff --git a/test/CodeGen/no-ident-version.c b/test/CodeGen/no-ident-version.c
new file mode 100644
index 000000000000..3765a605a988
--- /dev/null
+++ b/test/CodeGen/no-ident-version.c
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited -o - %s \
+// RUN: | FileCheck --check-prefix=CHECK-NONE %s
+// RUN: %clang_cc1 -Qn -emit-llvm -debug-info-kind=limited -o - %s \
+// RUN: | FileCheck --check-prefix=CHECK-QN %s
+// RUN: %clang_cc1 -fno-ident -emit-llvm -debug-info-kind=limited -o - %s \
+// RUN: | FileCheck --check-prefix=CHECK-QN %s
+// RUN: %clang_cc1 -Qy -emit-llvm -debug-info-kind=limited -o - %s \
+// RUN: | FileCheck --check-prefix=CHECK-QY %s
+// RUN: %clang_cc1 -fident -emit-llvm -debug-info-kind=limited -o - %s \
+// RUN: | FileCheck --check-prefix=CHECK-QY %s
+
+// CHECK-NONE: @main
+// CHECK-NONE: llvm.ident
+// CHECK-NONE: producer:
+
+// CHECK-QN: @main
+// CHECK-QN-NOT: llvm.ident
+// CHECK-QN-NOT: producer:
+
+// CHECK-QY: @main
+// CHECK-QY: llvm.ident
+// CHECK-QY: producer:
+int main(void) {}
diff --git a/test/CodeGen/no-junk-ftrunc.c b/test/CodeGen/no-junk-ftrunc.c
new file mode 100644
index 000000000000..a645d9688669
--- /dev/null
+++ b/test/CodeGen/no-junk-ftrunc.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -S -fno-strict-float-cast-overflow %s -emit-llvm -o - | FileCheck %s --check-prefix=NOSTRICT
+// NOSTRICT-LABEL: main
+// NOSTRICT: attributes #0 = {{.*}}"strict-float-cast-overflow"="false"{{.*}}
+
+// The workaround attribute is not applied by default.
+
+// RUN: %clang_cc1 -S %s -fstrict-float-cast-overflow -emit-llvm -o - | FileCheck %s --check-prefix=STRICT
+// STRICT-LABEL: main
+// STRICT-NOT: strict-float-cast-overflow
+
+// RUN: %clang_cc1 -S %s -emit-llvm -o - | FileCheck %s --check-prefix=DEFAULT
+// DEFAULT-LABEL: main
+// DEFAULT-NOT: strict-float-cast-overflow
+
+int main() {
+ return 0;
+}
+
diff --git a/test/CodeGen/no-opt-volatile-memcpy.c b/test/CodeGen/no-opt-volatile-memcpy.c
index bf98df39af16..2781475b6302 100644
--- a/test/CodeGen/no-opt-volatile-memcpy.c
+++ b/test/CodeGen/no-opt-volatile-memcpy.c
@@ -18,10 +18,10 @@ void foo (void) {
// CHECK: %[[LS:.*]] = alloca %struct.s, align 4
// CHECK-NEXT: %[[ZERO:.*]] = bitcast %struct.s* %[[LS]] to i8*
// CHECK-NEXT: %[[ONE:.*]] = bitcast %struct.s* %[[LS]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* %[[ZERO]], i8* %[[ONE]], i64 132, i32 4, i1 true)
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 %[[ZERO]], i8* align 4 %[[ONE]], i64 132, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i1 true)
// CHECK-NEXT: %[[TWO:.*]] = bitcast %struct.s* %[[LS]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* %[[TWO]], i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 %[[TWO]], i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i1 true)
struct s1 {
@@ -35,6 +35,6 @@ void fee (void) {
s.y = gs;
}
// CHECK-LABEL: define void @fee()
-// CHECK: call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK: call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i64 132, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i1 true)
diff --git a/test/CodeGen/no-prototype.c b/test/CodeGen/no-prototype.c
new file mode 100644
index 000000000000..6030357b70f7
--- /dev/null
+++ b/test/CodeGen/no-prototype.c
@@ -0,0 +1,20 @@
+// REQUIRES: webassembly-registered-target
+// RUN: %clang_cc1 -triple wasm32 -emit-llvm %s -o - | FileCheck %s
+
+int foo();
+
+int bar(int a) {
+ return foo();
+}
+
+int baz() {
+ return foo();
+}
+
+// CHECK: define i32 @bar(i32 %a) [[BAR_ATTR:#[0-9]+]] {
+// CHECK: declare i32 @foo(...) [[FOO_ATTR:#[0-9]+]]
+// CHECK: define i32 @baz() [[BAZ_ATTR:#[0-9]+]] {
+
+// CHECK: attributes [[FOO_ATTR]] = { {{.*}}"no-prototype"{{.*}} }
+// CHECK-NOT: attributes [[BAR_ATTR]] = { {{.*}}"no-prototype"{{.*}} }
+// CHECK-NOT: attributes [[BAZ_ATTR]] = { {{.*}}"no-prototype"{{.*}} }
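
The "no-prototype" attribute exercised above flags declarations whose signature is provisional because they were written without a prototype (int foo();); the CHECK-NOT lines confirm that defined functions never carry it. For contrast, a hedged sketch of the prototyped case (qux is illustrative, not part of the test), which would be emitted as a plain declaration without the attribute:

int qux(void);       /* full prototype: the wasm backend needs no placeholder */

int call_qux(void) {
  return qux();      /* expected: declare i32 @qux() without "no-prototype" */
}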
diff --git a/test/CodeGen/nonnull.c b/test/CodeGen/nonnull.c
index 7c33e6329fdd..30162441cf4c 100644
--- a/test/CodeGen/nonnull.c
+++ b/test/CodeGen/nonnull.c
@@ -1,32 +1,39 @@
-// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm < %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm < %s | FileCheck -check-prefix=NULL-INVALID %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -fno-delete-null-pointer-checks < %s | FileCheck -check-prefix=NULL-VALID %s
-// CHECK: define void @foo(i32* nonnull %x)
+// NULL-INVALID: define void @foo(i32* nonnull %x)
+// NULL-VALID: define void @foo(i32* %x)
void foo(int * __attribute__((nonnull)) x) {
*x = 0;
}
-// CHECK: define void @bar(i32* nonnull %x)
+// NULL-INVALID: define void @bar(i32* nonnull %x)
+// NULL-VALID: define void @bar(i32* %x)
void bar(int * x) __attribute__((nonnull(1))) {
*x = 0;
}
-// CHECK: define void @bar2(i32* %x, i32* nonnull %y)
+// NULL-INVALID: define void @bar2(i32* %x, i32* nonnull %y)
+// NULL-VALID: define void @bar2(i32* %x, i32* %y)
void bar2(int * x, int * y) __attribute__((nonnull(2))) {
*x = 0;
}
static int a;
-// CHECK: define nonnull i32* @bar3()
+// NULL-INVALID: define nonnull i32* @bar3()
+// NULL-VALID: define i32* @bar3()
int * bar3() __attribute__((returns_nonnull)) {
return &a;
}
-// CHECK: define i32 @bar4(i32 %n, i32* nonnull %p)
+// NULL-INVALID: define i32 @bar4(i32 %n, i32* nonnull %p)
+// NULL-VALID: define i32 @bar4(i32 %n, i32* %p)
int bar4(int n, int *p) __attribute__((nonnull)) {
return n + *p;
}
-// CHECK: define i32 @bar5(i32 %n, i32* nonnull %p)
+// NULL-INVALID: define i32 @bar5(i32 %n, i32* nonnull %p)
+// NULL-VALID: define i32 @bar5(i32 %n, i32* %p)
int bar5(int n, int *p) __attribute__((nonnull(1, 2))) {
return n + *p;
}
@@ -37,15 +44,18 @@ typedef union {
double d;
} TransparentUnion __attribute__((transparent_union));
-// CHECK: define i32 @bar6(i64 %
+// NULL-INVALID: define i32 @bar6(i64 %
+// NULL-VALID: define i32 @bar6(i64 %
int bar6(TransparentUnion tu) __attribute__((nonnull(1))) {
return *tu.p;
}
-// CHECK: define void @bar7(i32* nonnull %a, i32* nonnull %b)
+// NULL-INVALID: define void @bar7(i32* nonnull %a, i32* nonnull %b)
+// NULL-VALID: define void @bar7(i32* %a, i32* %b)
void bar7(int *a, int *b) __attribute__((nonnull(1)))
__attribute__((nonnull(2))) {}
-// CHECK: define void @bar8(i32* nonnull %a, i32* nonnull %b)
+// NULL-INVALID: define void @bar8(i32* nonnull %a, i32* nonnull %b)
+// NULL-VALID: define void @bar8(i32* %a, i32* %b)
void bar8(int *a, int *b) __attribute__((nonnull))
__attribute__((nonnull(1))) {}
diff --git a/test/CodeGen/noplt.c b/test/CodeGen/noplt.c
index f467199efab2..20e2e07b0fd4 100644
--- a/test/CodeGen/noplt.c
+++ b/test/CodeGen/noplt.c
@@ -1,7 +1,8 @@
-// RUN: %clang_cc1 -emit-llvm -fno-plt %s -o - | FileCheck %s -check-prefix=CHECK-NOPLT
+// RUN: %clang_cc1 -emit-llvm -fno-plt %s -o - | FileCheck %s -check-prefix=CHECK-NOPLT -check-prefix=CHECK-NOPLT-METADATA
// CHECK-NOPLT: Function Attrs: nonlazybind
// CHECK-NOPLT-NEXT: declare {{.*}}i32 @foo
+// CHECK-NOPLT-METADATA: !"RtLibUseGOT"
int foo();
int bar() {
diff --git a/test/CodeGen/opt-record-MIR.c b/test/CodeGen/opt-record-MIR.c
index 00b91ffdf3cb..37239281e9e9 100644
--- a/test/CodeGen/opt-record-MIR.c
+++ b/test/CodeGen/opt-record-MIR.c
@@ -21,8 +21,9 @@ void foo(float *p, int i) {
// YAML: --- !Missed
// YAML: Pass: regalloc
// YAML: Name: LoopSpillReload
-// YAML: DebugLoc: { File: {{.*}},
-// YAML: Line: 10, Column: 11 }
+// YAML: DebugLoc: { File: {{[^,]+}},
+// YAML: Line: 10,
+// YAML: Column: 11 }
// YAML: Function: foo
// YAML: Args:
// YAML: - NumSpills: '{{.}}'
diff --git a/test/CodeGen/overloadable.c b/test/CodeGen/overloadable.c
index 1feb12f5b579..f4fd0b36308b 100644
--- a/test/CodeGen/overloadable.c
+++ b/test/CodeGen/overloadable.c
@@ -41,7 +41,7 @@ void addrof_single(int *a) __attribute__((overloadable, enable_if(0, "")));
void addrof_single(char *a) __attribute__((overloadable, enable_if(0, "")));
void addrof_single(char *a) __attribute__((overloadable));
-// CHECK-LABEL: define void @foo
+// CHECK-LABEL: define {{(dso_local )?}}void @foo
void foo() {
// CHECK: store void (i8*)* @_Z11addrof_manyPc
void (*p1)(char *) = &addrof_many;
@@ -64,7 +64,7 @@ void foo() {
void ovl_bar(char *) __attribute__((overloadable));
void ovl_bar(int) __attribute__((overloadable));
-// CHECK-LABEL: define void @bar
+// CHECK-LABEL: define {{(dso_local )?}}void @bar
void bar() {
char charbuf[1];
unsigned char ucharbuf[1];
@@ -79,7 +79,7 @@ void ovl_baz(int *, int) __attribute__((overloadable));
void ovl_baz(unsigned int *, unsigned int) __attribute__((overloadable));
void ovl_baz2(int, int *) __attribute__((overloadable));
void ovl_baz2(unsigned int, unsigned int *) __attribute__((overloadable));
-// CHECK-LABEL: define void @baz
+// CHECK-LABEL: define {{(dso_local )?}}void @baz
void baz() {
unsigned int j;
// Initial rules for incompatible pointer conversions made this overload
diff --git a/test/CodeGen/packed-nest-unpacked.c b/test/CodeGen/packed-nest-unpacked.c
index e2bbd41a9daf..4124a8ade86a 100644
--- a/test/CodeGen/packed-nest-unpacked.c
+++ b/test/CodeGen/packed-nest-unpacked.c
@@ -9,26 +9,26 @@ struct X foo(void);
// <rdar://problem/10463337>
struct X test1() {
// CHECK: @test1
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i32 1, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* align 1 bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i1 false)
return g.y;
}
struct X test2() {
// CHECK: @test2
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i32 1, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* align 1 bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i1 false)
struct X a = g.y;
return a;
}
void test3(struct X a) {
// CHECK: @test3
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i8* {{.*}}, i64 24, i32 1, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i8* {{.*}}, i64 24, i1 false)
g.y = a;
}
// <rdar://problem/10530444>
void test4() {
// CHECK: @test4
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i32 1, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* align 1 bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i1 false)
f(g.y);
}
@@ -42,7 +42,7 @@ int test5() {
// <rdar://problem/11220251>
void test6() {
// CHECK: @test6
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i8* %{{.*}}, i64 24, i32 1, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i8* align 4 %{{.*}}, i64 24, i1 false)
g.y = foo();
}
diff --git a/test/CodeGen/packed-structure.c b/test/CodeGen/packed-structure.c
index 7d1183dc5ca6..91cacb400536 100644
--- a/test/CodeGen/packed-structure.c
+++ b/test/CodeGen/packed-structure.c
@@ -29,7 +29,7 @@ int s0_load_x(struct s0 *a) { return a->x; }
// CHECK-FUNCTIONS: ret i32 [[s0_load_y]]
int s0_load_y(struct s0 *a) { return a->y; }
// CHECK-FUNCTIONS-LABEL: define void @s0_copy
-// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i32 4, i1 false)
+// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 {{.*}}, i8* align 4 {{.*}}, i64 8, i1 false)
void s0_copy(struct s0 *a, struct s0 *b) { *b = *a; }
//
@@ -55,7 +55,7 @@ int s1_load_x(struct s1 *a) { return a->x; }
// CHECK-FUNCTIONS: ret i32 [[s1_load_y]]
int s1_load_y(struct s1 *a) { return a->y; }
// CHECK-FUNCTIONS-LABEL: define void @s1_copy
-// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i32 1, i1 false)
+// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i1 false)
void s1_copy(struct s1 *a, struct s1 *b) { *b = *a; }
//
@@ -83,7 +83,7 @@ int s2_load_x(struct s2 *a) { return a->x; }
// CHECK-FUNCTIONS: ret i32 [[s2_load_y]]
int s2_load_y(struct s2 *a) { return a->y; }
// CHECK-FUNCTIONS-LABEL: define void @s2_copy
-// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i32 2, i1 false)
+// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 {{.*}}, i8* align 2 {{.*}}, i64 8, i1 false)
void s2_copy(struct s2 *a, struct s2 *b) { *b = *a; }
struct __attribute__((packed, aligned)) s3 {
diff --git a/test/CodeGen/partial-reinitialization2.c b/test/CodeGen/partial-reinitialization2.c
index c4f6567b36ce..9c3ce99b98dc 100644
--- a/test/CodeGen/partial-reinitialization2.c
+++ b/test/CodeGen/partial-reinitialization2.c
@@ -15,7 +15,7 @@ union ULP3 { struct LP3 l3; };
// CHECK-LABEL: test1
void test1(void)
{
- // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i32 {{[0-9]}}, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i1 false)
// CHECK: store i8 120, i8* %
struct LP1 l = { .p1 = g1, .p1.x[2] = 'x' };
@@ -24,7 +24,7 @@ void test1(void)
// CHECK-LABEL: test2
void test2(void)
{
- // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i32 {{[0-9]}}, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i1 false)
// CHECK: store i8 114, i8* %
struct LP1 l = { .p1 = g1, .p1.x[1] = 'r' };
@@ -33,7 +33,7 @@ void test2(void)
// CHECK-LABEL: test3
void test3(void)
{
- // CHECK: call void @llvm.memcpy{{.*}}%struct.P2* @g2{{.*}}i64 12, i32 {{[0-9]}}, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}}%struct.P2* @g2{{.*}}i64 12, i1 false)
// CHECK: store i32 10, i32* %
struct LP2 l = { .p2 = g2, .p2.b = 10 };
@@ -66,7 +66,7 @@ void test4(void)
// CHECK: [[CALL:%[a-z0-9]+]] = call {{.*}}@get123()
// CHECK: store{{.*}}[[CALL]], {{.*}}[[TMP0:%[a-z0-9]+]]
// CHECK: [[TMP1:%[a-z0-9]+]] = bitcast {{.*}}[[TMP0]]
- // CHECK: call void @llvm.memcpy{{.*}}[[TMP1]], i64 12, i32 {{[0-9]}}, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}}[[TMP1]], i64 12, i1 false)
// CHECK: store i32 100, i32* %
struct LUP2 { union UP2 up; } var = { get123(), .up.p2.a = 100 };
@@ -76,12 +76,12 @@ void test4(void)
void test5(void)
{
// .l3 = g3
- // CHECK: call void @llvm.memcpy{{.*}}%struct.LP3, %struct.LP3* @g3{{.*}}i64 12, i32 {{[0-9]}}, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}}%struct.LP3, %struct.LP3* @g3{{.*}}i64 12, i1 false)
// .l3.p1 = { [0] = g1 } implicitly sets [1] to zero
- // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i32 {{[0-9]}}, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i1 false)
// CHECK: getelementptr{{.*}}%struct.P1, %struct.P1*{{.*}}i64 1
- // CHECK: call void @llvm.memset{{.*}}i8 0, i64 6, i32 {{[0-9]}}, i1 false)
+ // CHECK: call void @llvm.memset{{.*}}i8 0, i64 6, i1 false)
// .l3.p1[1].x[1] = 'x'
// CHECK: store i8 120, i8* %
@@ -98,7 +98,7 @@ void test6(void)
// CHECK: [[CALL:%[a-z0-9]+]] = call {{.*}}@get235()
// CHECK: store{{.*}}[[CALL]], {{.*}}[[TMP0:%[a-z0-9]+]]
// CHECK: [[TMP1:%[a-z0-9]+]] = bitcast {{.*}}[[TMP0]]
- // CHECK: call void @llvm.memcpy{{.*}}[[TMP1]], i64 12, i32 {{[0-9]}}, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}}[[TMP1]], i64 12, i1 false)
// CHECK: store i32 10, i32* %
diff --git a/test/CodeGen/pch-dllexport.cpp b/test/CodeGen/pch-dllexport.cpp
new file mode 100644
index 000000000000..9c6623be4362
--- /dev/null
+++ b/test/CodeGen/pch-dllexport.cpp
@@ -0,0 +1,84 @@
+// Build PCH without object file, then use it.
+// RUN: %clang_cc1 -triple i686-pc-win32 -fms-extensions -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -fms-extensions -emit-obj -emit-llvm -include-pch %t -o - %s | FileCheck -check-prefix=PCH %s
+
+// Build PCH with object file, then use it.
+// RUN: %clang_cc1 -triple i686-pc-win32 -fms-extensions -emit-pch -building-pch-with-obj -o %t %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -fms-extensions -emit-obj -emit-llvm -include-pch %t -building-pch-with-obj -o - %s | FileCheck -check-prefix=OBJ %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -fms-extensions -emit-obj -emit-llvm -include-pch %t -o - %s | FileCheck -check-prefix=PCHWITHOBJ %s
+
+// Check for vars separately to avoid having to reorder the check statements.
+// RUN: %clang_cc1 -triple i686-pc-win32 -fms-extensions -emit-obj -emit-llvm -include-pch %t -o - %s | FileCheck -check-prefix=PCHWITHOBJVARS %s
+
+#ifndef IN_HEADER
+#define IN_HEADER
+
+inline void __declspec(dllexport) foo() {}
+// OBJ: define weak_odr dso_local dllexport void @"?foo@@YAXXZ"
+// PCH: define weak_odr dso_local dllexport void @"?foo@@YAXXZ"
+// PCHWITHOBJ-NOT: define {{.*}}foo
+
+
+// This function is referenced, so gets emitted as usual.
+inline void __declspec(dllexport) baz() {}
+// OBJ: define weak_odr dso_local dllexport void @"?baz@@YAXXZ"
+// PCH: define weak_odr dso_local dllexport void @"?baz@@YAXXZ"
+// PCHWITHOBJ: define weak_odr dso_local dllexport void @"?baz@@YAXXZ"
+
+
+struct __declspec(dllexport) S {
+ void bar() {}
+// OBJ: define weak_odr dso_local dllexport x86_thiscallcc void @"?bar@S@@QAEXXZ"
+// PCH: define weak_odr dso_local dllexport x86_thiscallcc void @"?bar@S@@QAEXXZ"
+// PCHWITHOBJ-NOT: define {{.*}}bar
+};
+
+// This isn't dllexported, attribute((used)) or referenced, so not emitted.
+inline void quux() {}
+// OBJ-NOT: define {{.*}}quux
+// PCH-NOT: define {{.*}}quux
+// PCHWITHOBJ-NOT: define {{.*}}quux
+
+// Referenced non-dllexport function.
+inline void referencedNonExported() {}
+// OBJ: define {{.*}}referencedNonExported
+// PCH: define {{.*}}referencedNonExported
+// PCHWITHOBJ: define {{.*}}referencedNonExported
+
+template <typename T> void __declspec(dllexport) implicitInstantiation(T) {}
+
+template <typename T> inline void __declspec(dllexport) explicitSpecialization(T) {}
+
+template <typename T> void __declspec(dllexport) explicitInstantiationDef(T) {}
+
+template <typename T> void __declspec(dllexport) explicitInstantiationDefAfterDecl(T) {}
+extern template void explicitInstantiationDefAfterDecl<int>(int);
+
+template <typename T> T __declspec(dllexport) variableTemplate;
+extern template int variableTemplate<int>;
+
+#else
+
+void use() {
+ baz();
+ referencedNonExported();
+}
+
+// Templates can be tricky. None of the definitions below come from the PCH.
+
+void useTemplate() { implicitInstantiation(42); }
+// PCHWITHOBJ: define weak_odr dso_local dllexport void @"??$implicitInstantiation@H@@YAXH@Z"
+
+template<> inline void __declspec(dllexport) explicitSpecialization<int>(int) {}
+// PCHWITHOBJ: define weak_odr dso_local dllexport void @"??$explicitSpecialization@H@@YAXH@Z"
+
+template void __declspec(dllexport) explicitInstantiationDef<int>(int);
+// PCHWITHOBJ: define weak_odr dso_local dllexport void @"??$explicitInstantiationDef@H@@YAXH@Z"
+
+template void __declspec(dllexport) explicitInstantiationDefAfterDecl<int>(int);
+// PCHWITHOBJ: define weak_odr dso_local dllexport void @"??$explicitInstantiationDefAfterDecl@H@@YAXH@Z"(i32)
+
+template int __declspec(dllexport) variableTemplate<int>;
+// PCHWITHOBJVARS: @"??$variableTemplate@H@@3HA" = weak_odr dso_local dllexport global
+
+#endif
diff --git a/test/CodeGen/personality.c b/test/CodeGen/personality.c
new file mode 100644
index 000000000000..1c533ce786c0
--- /dev/null
+++ b/test/CodeGen/personality.c
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fdwarf-exceptions -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-DWARF
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fseh-exceptions -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-SEH
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fexceptions -fsjlj-exceptions -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-SJLJ
+
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -fexceptions -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN
+// RUN: %clang_cc1 -triple i686-unknown-windows-msvc -D __SEH_EXCEPTIONS__ -fms-extensions -fexceptions -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-SEH-X86
+// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -D __SEH_EXCEPTIONS__ -fms-extensions -fexceptions -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-WIN-SEH-X64
+
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-GNU
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fdwarf-exceptions -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-DWARF
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fseh-exceptions -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-SEH
+// RUN: %clang_cc1 -triple i686-unknown-windows-gnu -fexceptions -fsjlj-exceptions -fblocks -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-SJLJ
+
+
+extern void g(void (^)(void));
+extern void i(void);
+
+// CHECK-GNU: personality i8* bitcast (i32 (...)* @__gcc_personality_v0 to i8*)
+// CHECK-DWARF: personality i8* bitcast (i32 (...)* @__gcc_personality_v0 to i8*)
+// CHECK-SEH: personality i8* bitcast (i32 (...)* @__gcc_personality_seh0 to i8*)
+// CHECK-SJLJ: personality i8* bitcast (i32 (...)* @__gcc_personality_sj0 to i8*)
+
+// CHECK-WIN: personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+// CHECK-MINGW-SEH: personality i8* bitcast (i32 (...)* @__gcc_personality_seh0 to i8*)
+
+void f(void) {
+ __block int i;
+ g(^ { });
+}
+
+#if defined(__SEH_EXCEPTIONS__)
+// CHECK-WIN-SEH-X86: personality i8* bitcast (i32 (...)* @_except_handler3 to i8*)
+// CHECK-WIN-SEH-X64: personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
+
+void h(void) {
+ __try {
+ i();
+ } __finally {
+ }
+}
+#endif
+
diff --git a/test/CodeGen/pgo-sample-thinlto-summary.c b/test/CodeGen/pgo-sample-thinlto-summary.c
index 7045db08f22e..eae35a040e5f 100644
--- a/test/CodeGen/pgo-sample-thinlto-summary.c
+++ b/test/CodeGen/pgo-sample-thinlto-summary.c
@@ -13,8 +13,8 @@ void foo(int n) {
g += baz(i);
}
-// SAMPLEPGO-LABEL: define void @bar
-// THINLTO-LABEL: define void @bar
+// SAMPLEPGO-LABEL: define {{(dso_local )?}}void @bar
+// THINLTO-LABEL: define {{(dso_local )?}}void @bar
// SAMPLEPGO-NOT: call{{.*}}foo
// THINLTO: call{{.*}}foo
void bar(int n) {
@@ -23,8 +23,8 @@ void bar(int n) {
}
// Checks if loop unroll is invoked by normal compile, but not thinlto compile.
-// SAMPLEPGO-LABEL: define void @unroll
-// THINLTO-LABEL: define void @unroll
+// SAMPLEPGO-LABEL: define {{(dso_local )?}}void @unroll
+// THINLTO-LABEL: define {{(dso_local )?}}void @unroll
// SAMPLEPGO: call{{.*}}baz
// SAMPLEPGO: call{{.*}}baz
// THINLTO: call{{.*}}baz
@@ -35,8 +35,8 @@ void unroll() {
}
// Checks that icp is not invoked for ThinLTO, but invoked for normal samplepgo.
-// SAMPLEPGO-LABEL: define void @icp
-// THINLTO-LABEL: define void @icp
+// SAMPLEPGO-LABEL: define {{(dso_local )?}}void @icp
+// THINLTO-LABEL: define {{(dso_local )?}}void @icp
// SAMPLEPGO: if.true.direct_targ
// FIXME: the following condition needs to be reversed once
// LTOPreLinkDefaultPipeline is customized.
diff --git a/test/CodeGen/popcnt-builtins.c b/test/CodeGen/popcnt-builtins.c
index 656a20f98605..1fdb43339a80 100644
--- a/test/CodeGen/popcnt-builtins.c
+++ b/test/CodeGen/popcnt-builtins.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +popcnt -emit-llvm -o - | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
unsigned int test_mm_popcnt_u32(unsigned int __X) {
//CHECK: call i32 @llvm.ctpop.i32
diff --git a/test/CodeGen/ppc-varargs-struct.c b/test/CodeGen/ppc-varargs-struct.c
index d7936a126960..c201074e1fd9 100644
--- a/test/CodeGen/ppc-varargs-struct.c
+++ b/test/CodeGen/ppc-varargs-struct.c
@@ -54,7 +54,7 @@ void testva (int n, ...)
// CHECK-PPC-NEXT: [[AGGR:%[a-z0-9]+]] = load %struct.x*, %struct.x** [[VAARG_ADDR]]
// CHECK-PPC-NEXT: [[DEST:%[0-9]+]] = bitcast %struct.x* %t to i8*
// CHECK-PPC-NEXT: [[SRC:%.+]] = bitcast %struct.x* [[AGGR]] to i8*
-// CHECK-PPC-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[DEST]], i8* [[SRC]], i32 16, i32 8, i1 false)
+// CHECK-PPC-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[DEST]], i8* align 8 [[SRC]], i32 16, i1 false)
int v = va_arg (ap, int);
diff --git a/test/CodeGen/ppc64-align-struct.c b/test/CodeGen/ppc64-align-struct.c
index 5894a6aeb379..5e2dc8b2eb8a 100644
--- a/test/CodeGen/ppc64-align-struct.c
+++ b/test/CodeGen/ppc64-align-struct.c
@@ -48,15 +48,14 @@ void test7 (int x, struct test7 y)
{
}
-// CHECK-LABEL: define void @test1va(%struct.test1* noalias sret %agg.result, i32 signext %x, ...)
-// CHECK: %y = alloca %struct.test1, align 4
+// CHECK: define void @test1va(%struct.test1* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...)
// CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap
// CHECK: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[CUR]], i64 8
// CHECK: store i8* %[[NEXT]], i8** %ap
// CHECK: [[T0:%.*]] = bitcast i8* %[[CUR]] to %struct.test1*
-// CHECK: [[DEST:%.*]] = bitcast %struct.test1* %y to i8*
+// CHECK: [[DEST:%.*]] = bitcast %struct.test1* %[[AGG_RESULT]] to i8*
// CHECK: [[SRC:%.*]] = bitcast %struct.test1* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 8, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[DEST]], i8* align 8 [[SRC]], i64 8, i1 false)
struct test1 test1va (int x, ...)
{
struct test1 y;
@@ -67,8 +66,7 @@ struct test1 test1va (int x, ...)
return y;
}
-// CHECK-LABEL: define void @test2va(%struct.test2* noalias sret %agg.result, i32 signext %x, ...)
-// CHECK: %y = alloca %struct.test2, align 16
+// CHECK: define void @test2va(%struct.test2* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...)
// CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap
// CHECK: %[[TMP0:[^ ]+]] = ptrtoint i8* %[[CUR]] to i64
// CHECK: %[[TMP1:[^ ]+]] = add i64 %[[TMP0]], 15
@@ -77,9 +75,9 @@ struct test1 test1va (int x, ...)
// CHECK: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[ALIGN]], i64 16
// CHECK: store i8* %[[NEXT]], i8** %ap
// CHECK: [[T0:%.*]] = bitcast i8* %[[ALIGN]] to %struct.test2*
-// CHECK: [[DEST:%.*]] = bitcast %struct.test2* %y to i8*
+// CHECK: [[DEST:%.*]] = bitcast %struct.test2* %[[AGG_RESULT]] to i8*
// CHECK: [[SRC:%.*]] = bitcast %struct.test2* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 16, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[DEST]], i8* align 16 [[SRC]], i64 16, i1 false)
struct test2 test2va (int x, ...)
{
struct test2 y;
@@ -90,8 +88,7 @@ struct test2 test2va (int x, ...)
return y;
}
-// CHECK-LABEL: define void @test3va(%struct.test3* noalias sret %agg.result, i32 signext %x, ...)
-// CHECK: %y = alloca %struct.test3, align 32
+// CHECK: define void @test3va(%struct.test3* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...)
// CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap
// CHECK: %[[TMP0:[^ ]+]] = ptrtoint i8* %[[CUR]] to i64
// CHECK: %[[TMP1:[^ ]+]] = add i64 %[[TMP0]], 15
@@ -100,9 +97,9 @@ struct test2 test2va (int x, ...)
// CHECK: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[ALIGN]], i64 32
// CHECK: store i8* %[[NEXT]], i8** %ap
// CHECK: [[T0:%.*]] = bitcast i8* %[[ALIGN]] to %struct.test3*
-// CHECK: [[DEST:%.*]] = bitcast %struct.test3* %y to i8*
+// CHECK: [[DEST:%.*]] = bitcast %struct.test3* %[[AGG_RESULT]] to i8*
// CHECK: [[SRC:%.*]] = bitcast %struct.test3* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 32 [[DEST]], i8* align 16 [[SRC]], i64 32, i1 false)
struct test3 test3va (int x, ...)
{
struct test3 y;
@@ -113,15 +110,14 @@ struct test3 test3va (int x, ...)
return y;
}
-// CHECK-LABEL: define void @test4va(%struct.test4* noalias sret %agg.result, i32 signext %x, ...)
-// CHECK: %y = alloca %struct.test4, align 4
+// CHECK: define void @test4va(%struct.test4* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...)
// CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap
// CHECK: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[CUR]], i64 16
// CHECK: store i8* %[[NEXT]], i8** %ap
// CHECK: [[T0:%.*]] = bitcast i8* %[[CUR]] to %struct.test4*
-// CHECK: [[DEST:%.*]] = bitcast %struct.test4* %y to i8*
+// CHECK: [[DEST:%.*]] = bitcast %struct.test4* %[[AGG_RESULT]] to i8*
// CHECK: [[SRC:%.*]] = bitcast %struct.test4* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 12, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[DEST]], i8* align 8 [[SRC]], i64 12, i1 false)
struct test4 test4va (int x, ...)
{
struct test4 y;
@@ -132,15 +128,14 @@ struct test4 test4va (int x, ...)
return y;
}
-// CHECK-LABEL: define void @testva_longdouble(%struct.test_longdouble* noalias sret %agg.result, i32 signext %x, ...)
-// CHECK: %y = alloca %struct.test_longdouble, align 16
+// CHECK: define void @testva_longdouble(%struct.test_longdouble* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...)
// CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap
// CHECK: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[CUR]], i64 16
// CHECK: store i8* %[[NEXT]], i8** %ap
// CHECK: [[T0:%.*]] = bitcast i8* %[[CUR]] to %struct.test_longdouble*
-// CHECK: [[DEST:%.*]] = bitcast %struct.test_longdouble* %y to i8*
+// CHECK: [[DEST:%.*]] = bitcast %struct.test_longdouble* %[[AGG_RESULT]] to i8*
// CHECK: [[SRC:%.*]] = bitcast %struct.test_longdouble* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[DEST]], i8* align 8 [[SRC]], i64 16, i1 false)
struct test_longdouble { long double x; };
struct test_longdouble testva_longdouble (int x, ...)
{
@@ -152,8 +147,7 @@ struct test_longdouble testva_longdouble (int x, ...)
return y;
}
-// CHECK-LABEL: define void @testva_vector(%struct.test_vector* noalias sret %agg.result, i32 signext %x, ...)
-// CHECK: %y = alloca %struct.test_vector, align 16
+// CHECK: define void @testva_vector(%struct.test_vector* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...)
// CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap
// CHECK: %[[TMP0:[^ ]+]] = ptrtoint i8* %[[CUR]] to i64
// CHECK: %[[TMP1:[^ ]+]] = add i64 %[[TMP0]], 15
@@ -162,9 +156,9 @@ struct test_longdouble testva_longdouble (int x, ...)
// CHECK: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[ALIGN]], i64 16
// CHECK: store i8* %[[NEXT]], i8** %ap
// CHECK: [[T0:%.*]] = bitcast i8* %[[ALIGN]] to %struct.test_vector*
-// CHECK: [[DEST:%.*]] = bitcast %struct.test_vector* %y to i8*
+// CHECK: [[DEST:%.*]] = bitcast %struct.test_vector* %[[AGG_RESULT]] to i8*
// CHECK: [[SRC:%.*]] = bitcast %struct.test_vector* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 16, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[DEST]], i8* align 16 [[SRC]], i64 16, i1 false)
struct test_vector { vector int x; };
struct test_vector testva_vector (int x, ...)
{
diff --git a/test/CodeGen/ppc64-soft-float.c b/test/CodeGen/ppc64-soft-float.c
index 95b18292319d..84ac2d55b636 100644
--- a/test/CodeGen/ppc64-soft-float.c
+++ b/test/CodeGen/ppc64-soft-float.c
@@ -92,7 +92,7 @@ void call_f2(void) { global_f2 = func_f2(global_f2); }
// CHECK-BE: %[[TMP0:[^ ]+]] = alloca %struct.f3, align 4
// CHECK: %[[TMP1:[^ ]+]] = alloca [2 x i64]
// CHECK: %[[TMP2:[^ ]+]] = bitcast [2 x i64]* %[[TMP1]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f3* @global_f3 to i8*), i64 12, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.f3* @global_f3 to i8*), i64 12, i1 false)
// CHECK: %[[TMP3:[^ ]+]] = load [2 x i64], [2 x i64]* %[[TMP1]]
// CHECK-LE: call { i64, i64 } @func_f3([2 x i64] %[[TMP3]])
// CHECK-BE: call void @func_f3(%struct.f3* sret %[[TMP0]], [2 x i64] %[[TMP3]])
@@ -111,7 +111,7 @@ void call_f4(void) { global_f4 = func_f4(global_f4); }
// CHECK: %[[TMP0:[^ ]+]] = alloca %struct.f5, align 4
// CHECK: %[[TMP1:[^ ]+]] = alloca [3 x i64]
// CHECK: %[[TMP2:[^ ]+]] = bitcast [3 x i64]* %[[TMP1]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f5* @global_f5 to i8*), i64 20, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.f5* @global_f5 to i8*), i64 20, i1 false)
// CHECK: %[[TMP3:[^ ]+]] = load [3 x i64], [3 x i64]* %[[TMP1]]
// CHECK: call void @func_f5(%struct.f5* sret %[[TMP0]], [3 x i64] %[[TMP3]])
struct f5 global_f5;
@@ -128,7 +128,7 @@ void call_f6(void) { global_f6 = func_f6(global_f6); }
// CHECK: %[[TMP0:[^ ]+]] = alloca %struct.f7, align 4
// CHECK: %[[TMP1:[^ ]+]] = alloca [4 x i64], align 8
// CHECK: %[[TMP2:[^ ]+]] = bitcast [4 x i64]* %[[TMP1]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f7* @global_f7 to i8*), i64 28, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.f7* @global_f7 to i8*), i64 28, i1 false)
// CHECK: %[[TMP3:[^ ]+]] = load [4 x i64], [4 x i64]* %[[TMP1]], align 8
// CHECK: call void @func_f7(%struct.f7* sret %[[TMP0]], [4 x i64] %[[TMP3]])
struct f7 global_f7;
@@ -144,7 +144,7 @@ void call_f8(void) { global_f8 = func_f8(global_f8); }
// CHECK-LABEL: @call_f9
// CHECK: %[[TMP1:[^ ]+]] = alloca [5 x i64]
// CHECK: %[[TMP2:[^ ]+]] = bitcast [5 x i64]* %[[TMP1]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f9* @global_f9 to i8*), i64 36, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.f9* @global_f9 to i8*), i64 36, i1 false)
// CHECK: %[[TMP3:[^ ]+]] = load [5 x i64], [5 x i64]* %[[TMP1]]
// CHECK: call void @func_f9(%struct.f9* sret %{{[^ ]+}}, [5 x i64] %[[TMP3]])
struct f9 global_f9;
@@ -162,7 +162,7 @@ void call_fab(void) { global_fab = func_fab(global_fab); }
// CHECK-BE: %[[TMPX:[^ ]+]] = alloca %struct.fabc, align 4
// CHECK: %[[TMP0:[^ ]+]] = alloca [2 x i64], align 8
// CHECK: %[[TMP2:[^ ]+]] = bitcast [2 x i64]* %[[TMP0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.fabc* @global_fabc to i8*), i64 12, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.fabc* @global_fabc to i8*), i64 12, i1 false)
// CHECK: %[[TMP3:[^ ]+]] = load [2 x i64], [2 x i64]* %[[TMP0]], align 8
// CHECK-LE: %call = call { i64, i64 } @func_fabc([2 x i64] %[[TMP3]])
// CHECK-BE: call void @func_fabc(%struct.fabc* sret %[[TMPX]], [2 x i64] %[[TMP3]])
diff --git a/test/CodeGen/ppc64le-aggregates.c b/test/CodeGen/ppc64le-aggregates.c
index f78f26a59285..a7780beec5f6 100644
--- a/test/CodeGen/ppc64le-aggregates.c
+++ b/test/CodeGen/ppc64le-aggregates.c
@@ -104,7 +104,7 @@ void call_f8(void) { global_f8 = func_f8(global_f8); }
// CHECK-LABEL: @call_f9
// CHECK: %[[TMP1:[^ ]+]] = alloca [5 x i64]
// CHECK: %[[TMP2:[^ ]+]] = bitcast [5 x i64]* %[[TMP1]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f9* @global_f9 to i8*), i64 36, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.f9* @global_f9 to i8*), i64 36, i1 false)
// CHECK: %[[TMP3:[^ ]+]] = load [5 x i64], [5 x i64]* %[[TMP1]]
// CHECK: call void @func_f9(%struct.f9* sret %{{[^ ]+}}, [5 x i64] %[[TMP3]])
struct f9 global_f9;
diff --git a/test/CodeGen/ppc64le-f128Aggregates.c b/test/CodeGen/ppc64le-f128Aggregates.c
new file mode 100644
index 000000000000..a51b6d5e709c
--- /dev/null
+++ b/test/CodeGen/ppc64le-f128Aggregates.c
@@ -0,0 +1,124 @@
+// RUN: %clang_cc1 -triple powerpc64le-unknown-linux-gnu -emit-llvm \
+// RUN: -target-cpu pwr9 -target-feature +float128 -o - %s | FileCheck %s
+
+// Test homogeneous fp128 aggregate passing and returning.
+
+struct fp1 { __float128 f[1]; };
+struct fp2 { __float128 f[2]; };
+struct fp3 { __float128 f[3]; };
+struct fp4 { __float128 f[4]; };
+struct fp5 { __float128 f[5]; };
+struct fp6 { __float128 f[6]; };
+struct fp7 { __float128 f[7]; };
+struct fp8 { __float128 f[8]; };
+struct fp9 { __float128 f[9]; };
+
+struct fpab { __float128 a; __float128 b; };
+struct fpabc { __float128 a; __float128 b; __float128 c; };
+
+struct fp2a2b { __float128 a[2]; __float128 b[2]; };
+
+// CHECK: define [1 x fp128] @func_f1(fp128 inreg %x.coerce)
+struct fp1 func_f1(struct fp1 x) { return x; }
+
+// CHECK: define [2 x fp128] @func_f2([2 x fp128] %x.coerce)
+struct fp2 func_f2(struct fp2 x) { return x; }
+
+// CHECK: define [3 x fp128] @func_f3([3 x fp128] %x.coerce)
+struct fp3 func_f3(struct fp3 x) { return x; }
+
+// CHECK: define [4 x fp128] @func_f4([4 x fp128] %x.coerce)
+struct fp4 func_f4(struct fp4 x) { return x; }
+
+// CHECK: define [5 x fp128] @func_f5([5 x fp128] %x.coerce)
+struct fp5 func_f5(struct fp5 x) { return x; }
+
+// CHECK: define [6 x fp128] @func_f6([6 x fp128] %x.coerce)
+struct fp6 func_f6(struct fp6 x) { return x; }
+
+// CHECK: define [7 x fp128] @func_f7([7 x fp128] %x.coerce)
+struct fp7 func_f7(struct fp7 x) { return x; }
+
+// CHECK: define [8 x fp128] @func_f8([8 x fp128] %x.coerce)
+struct fp8 func_f8(struct fp8 x) { return x; }
+
+// CHECK: define void @func_f9(%struct.fp9* noalias sret %agg.result, %struct.fp9* byval align 16 %x)
+struct fp9 func_f9(struct fp9 x) { return x; }
+
+// CHECK: define [2 x fp128] @func_fab([2 x fp128] %x.coerce)
+struct fpab func_fab(struct fpab x) { return x; }
+
+// CHECK: define [3 x fp128] @func_fabc([3 x fp128] %x.coerce)
+struct fpabc func_fabc(struct fpabc x) { return x; }
+
+// CHECK: define [4 x fp128] @func_f2a2b([4 x fp128] %x.coerce)
+struct fp2a2b func_f2a2b(struct fp2a2b x) { return x; }
+
+// CHECK-LABEL: @call_fp1
+// CHECK: %[[TMP:[^ ]+]] = load fp128, fp128* getelementptr inbounds (%struct.fp1, %struct.fp1* @global_f1, i32 0, i32 0, i32 0), align 16
+// CHECK: call [1 x fp128] @func_f1(fp128 inreg %[[TMP]])
+struct fp1 global_f1;
+void call_fp1(void) { global_f1 = func_f1(global_f1); }
+
+// CHECK-LABEL: @call_fp2
+// CHECK: %[[TMP:[^ ]+]] = load [2 x fp128], [2 x fp128]* getelementptr inbounds (%struct.fp2, %struct.fp2* @global_f2, i32 0, i32 0), align 16
+// CHECK: call [2 x fp128] @func_f2([2 x fp128] %[[TMP]])
+struct fp2 global_f2;
+void call_fp2(void) { global_f2 = func_f2(global_f2); }
+
+// CHECK-LABEL: @call_fp3
+// CHECK: %[[TMP:[^ ]+]] = load [3 x fp128], [3 x fp128]* getelementptr inbounds (%struct.fp3, %struct.fp3* @global_f3, i32 0, i32 0), align 16
+// CHECK: call [3 x fp128] @func_f3([3 x fp128] %[[TMP]])
+struct fp3 global_f3;
+void call_fp3(void) { global_f3 = func_f3(global_f3); }
+
+// CHECK-LABEL: @call_fp4
+// CHECK: %[[TMP:[^ ]+]] = load [4 x fp128], [4 x fp128]* getelementptr inbounds (%struct.fp4, %struct.fp4* @global_f4, i32 0, i32 0), align 16
+// CHECK: call [4 x fp128] @func_f4([4 x fp128] %[[TMP]])
+struct fp4 global_f4;
+void call_fp4(void) { global_f4 = func_f4(global_f4); }
+
+// CHECK-LABEL: @call_fp5
+// CHECK: %[[TMP:[^ ]+]] = load [5 x fp128], [5 x fp128]* getelementptr inbounds (%struct.fp5, %struct.fp5* @global_f5, i32 0, i32 0), align 16
+// CHECK: call [5 x fp128] @func_f5([5 x fp128] %[[TMP]])
+struct fp5 global_f5;
+void call_fp5(void) { global_f5 = func_f5(global_f5); }
+
+// CHECK-LABEL: @call_fp6
+// CHECK: %[[TMP:[^ ]+]] = load [6 x fp128], [6 x fp128]* getelementptr inbounds (%struct.fp6, %struct.fp6* @global_f6, i32 0, i32 0), align 16
+// CHECK: call [6 x fp128] @func_f6([6 x fp128] %[[TMP]])
+struct fp6 global_f6;
+void call_fp6(void) { global_f6 = func_f6(global_f6); }
+
+// CHECK-LABEL: @call_fp7
+// CHECK: %[[TMP:[^ ]+]] = load [7 x fp128], [7 x fp128]* getelementptr inbounds (%struct.fp7, %struct.fp7* @global_f7, i32 0, i32 0), align 16
+// CHECK: call [7 x fp128] @func_f7([7 x fp128] %[[TMP]])
+struct fp7 global_f7;
+void call_fp7(void) { global_f7 = func_f7(global_f7); }
+
+// CHECK-LABEL: @call_fp8
+// CHECK: %[[TMP:[^ ]+]] = load [8 x fp128], [8 x fp128]* getelementptr inbounds (%struct.fp8, %struct.fp8* @global_f8, i32 0, i32 0), align 16
+// CHECK: call [8 x fp128] @func_f8([8 x fp128] %[[TMP]])
+struct fp8 global_f8;
+void call_fp8(void) { global_f8 = func_f8(global_f8); }
+
+// CHECK-LABEL: @call_fp9
+// CHECK: %[[TMP1:[^ ]+]] = alloca %struct.fp9, align 16
+// CHECK: call void @func_f9(%struct.fp9* sret %[[TMP2:[^ ]+]], %struct.fp9* byval align 16 @global_f9
+// CHECK: %[[TMP3:[^ ]+]] = bitcast %struct.fp9* %[[TMP2]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 bitcast (%struct.fp9* @global_f9 to i8*), i8* align 16 %[[TMP3]], i64 144, i1 false
+// CHECK: ret void
+struct fp9 global_f9;
+void call_fp9(void) { global_f9 = func_f9(global_f9); }
+
+// CHECK-LABEL: @call_fpab
+// CHECK: %[[TMP:[^ ]+]] = load [2 x fp128], [2 x fp128]* bitcast (%struct.fpab* @global_fab to [2 x fp128]*)
+// CHECK: call [2 x fp128] @func_fab([2 x fp128] %[[TMP]])
+struct fpab global_fab;
+void call_fpab(void) { global_fab = func_fab(global_fab); }
+
+// CHECK-LABEL: @call_fpabc
+// CHECK: %[[TMP:[^ ]+]] = load [3 x fp128], [3 x fp128]* bitcast (%struct.fpabc* @global_fabc to [3 x fp128]*)
+// CHECK: call [3 x fp128] @func_fabc([3 x fp128] %[[TMP]])
+struct fpabc global_fabc;
+void call_fpabc(void) { global_fabc = func_fabc(global_fabc); }
diff --git a/test/CodeGen/pr19841.cpp b/test/CodeGen/pr19841.cpp
index 8fef68384fc1..282b2e6d000b 100644
--- a/test/CodeGen/pr19841.cpp
+++ b/test/CodeGen/pr19841.cpp
@@ -12,7 +12,7 @@ class A {
unsigned char _highlightColorTableVGA[];
static const unsigned char b[];
};
-// CHECK: [[Common_A_b:@[^ ]+]] = constant [1 x i8] zeroinitializer
+// CHECK: [[Common_A_b:@[^ ]+]] = {{(dso_local )?}}constant [1 x i8] zeroinitializer
class B {
public:
Common::RenderMode _configRenderMode;
diff --git a/test/CodeGen/pr3518.c b/test/CodeGen/pr3518.c
index 5ce6a6570564..f0c06d9be6dc 100644
--- a/test/CodeGen/pr3518.c
+++ b/test/CodeGen/pr3518.c
@@ -1,6 +1,6 @@
// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s
// PR 3518
-// Some of the objects were coming out as unintialized (external) before 3518
+// Some of the objects were coming out as uninitialized (external) before 3518
// was fixed. Internal names are different between llvm-gcc and clang so they
// are not tested.
diff --git a/test/CodeGen/pr4349.c b/test/CodeGen/pr4349.c
index e39dc2c517f4..9ec6f2fac78a 100644
--- a/test/CodeGen/pr4349.c
+++ b/test/CodeGen/pr4349.c
@@ -16,22 +16,22 @@ struct svar
{
void *ptr;
};
-// CHECK: @svars1 = global [1 x %struct.svar] [%struct.svar { i8* bitcast (%struct.cpu* @cpu to i8*) }]
+// CHECK: @svars1 = {{(dso_local )?}}global [1 x %struct.svar] [%struct.svar { i8* bitcast (%struct.cpu* @cpu to i8*) }]
struct svar svars1[] =
{
{ &((cpu.pc).w[0]) }
};
-// CHECK: @svars2 = global [1 x %struct.svar] [%struct.svar { i8* getelementptr (i8, i8* bitcast (%struct.cpu* @cpu to i8*), i64 1) }]
+// CHECK: @svars2 = {{(dso_local )?}}global [1 x %struct.svar] [%struct.svar { i8* getelementptr (i8, i8* bitcast (%struct.cpu* @cpu to i8*), i64 1) }]
struct svar svars2[] =
{
{ &((cpu.pc).b[0][1]) }
};
-// CHECK: @svars3 = global [1 x %struct.svar] [%struct.svar { i8* getelementptr (i8, i8* bitcast (%struct.cpu* @cpu to i8*), i64 2) }]
+// CHECK: @svars3 = {{(dso_local )?}}global [1 x %struct.svar] [%struct.svar { i8* getelementptr (i8, i8* bitcast (%struct.cpu* @cpu to i8*), i64 2) }]
struct svar svars3[] =
{
{ &((cpu.pc).w[1]) }
};
-// CHECK: @svars4 = global [1 x %struct.svar] [%struct.svar { i8* getelementptr (i8, i8* bitcast (%struct.cpu* @cpu to i8*), i64 3) }]
+// CHECK: @svars4 = {{(dso_local )?}}global [1 x %struct.svar] [%struct.svar { i8* getelementptr (i8, i8* bitcast (%struct.cpu* @cpu to i8*), i64 3) }]
struct svar svars4[] =
{
{ &((cpu.pc).b[1][1]) }
diff --git a/test/CodeGen/pragma-comment.c b/test/CodeGen/pragma-comment.c
index fae9b8fb9ed8..1896e5c0fb98 100644
--- a/test/CodeGen/pragma-comment.c
+++ b/test/CodeGen/pragma-comment.c
@@ -23,10 +23,9 @@
// CHECK: ![[bar]] = !{!" /bar=2"}
// CHECK: ![[foo]] = !{!" /foo=\22foo bar\22"}
-// LINUX: !{!"-lmsvcrt.lib"}
-// LINUX: !{!"-lkernel32"}
-// LINUX: !{!"-lUSER32.LIB"}
-// LINUX: !{!" /bar=2"}
+// LINUX: !{!"lib", !"msvcrt.lib"}
+// LINUX: !{!"lib", !"kernel32"}
+// LINUX: !{!"lib", !"USER32.LIB"}
// PS4: !{!"\01msvcrt.lib"}
// PS4: !{!"\01kernel32"}
diff --git a/test/CodeGen/pragma-do-while.cpp b/test/CodeGen/pragma-do-while.cpp
new file mode 100644
index 000000000000..ecab7fc029e2
--- /dev/null
+++ b/test/CodeGen/pragma-do-while.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s
+
+// We expect to get a loop structure like this:
+// do.body: ; preds = %do.cond, ...
+// ...
+// br label %do.cond
+// do.cond: ; preds = %do.body
+// ...
+// br i1 %cmp, label %do.body, label %do.end
+// do.end: ; preds = %do.cond
+// ...
+//
+// Verify that the loop metadata is put only on the backedge.
+//
+// CHECK-NOT: llvm.loop
+// CHECK-LABEL: do.cond:
+// CHECK: br {{.*}}, label %do.body, label %do.end, !llvm.loop ![[LMD1:[0-9]+]]
+// CHECK-LABEL: do.end:
+// CHECK-NOT: llvm.loop
+// CHECK: ![[LMD1]] = distinct !{![[LMD1]], ![[LMD2:[0-9]+]]}
+// CHECK: ![[LMD2]] = !{!"llvm.loop.unroll.count", i32 4}
+
+int test(int a[], int n) {
+ int i = 0;
+ int sum = 0;
+
+#pragma unroll 4
+ do
+ {
+ a[i] = a[i] + 1;
+ sum = sum + a[i];
+ i++;
+ } while (i < n);
+
+ return sum;
+}
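As an aside, and purely as an illustrative sketch that is not part of the patch or the test above: the same unroll pragma ahead of a for loop is expected to behave analogously under the same RUN line, with the !llvm.loop metadata attached only to the branch that jumps back to the loop header. A minimal, hypothetical companion in C:

// Hypothetical companion example (not in the test): the unroll-count
// metadata should again appear only on the loop's backedge branch.
int test_for(int a[], int n) {
  int sum = 0;
#pragma unroll 4
  for (int i = 0; i < n; i++)
    sum = sum + a[i];
  return sum;
}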
diff --git a/test/CodeGen/preserve-call-conv.c b/test/CodeGen/preserve-call-conv.c
index b67e29f392a4..7a84fa282314 100644
--- a/test/CodeGen/preserve-call-conv.c
+++ b/test/CodeGen/preserve-call-conv.c
@@ -3,18 +3,19 @@
// RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -emit-llvm %s -o - | FileCheck %s
// RUN: %clang_cc1 -triple aarch64-unknown-windows-msvc -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv7-unknown-windows-msvc -emit-llvm %s -o - | FileCheck %s
// Check that the preserve_most calling convention attribute at the source level
// is lowered to the corresponding calling convention attribute at the LLVM IR
// level.
void foo() __attribute__((preserve_most)) {
- // CHECK-LABEL: define preserve_mostcc void @foo()
+ // CHECK-LABEL: define {{(dso_local )?}}preserve_mostcc void @foo()
}
// Check that the preserve_most calling convention attribute at the source level
// is lowered to the corresponding calling convention attribute at the LLVM IR
// level.
void boo() __attribute__((preserve_all)) {
- // CHECK-LABEL: define preserve_allcc void @boo()
+ // CHECK-LABEL: define {{(dso_local )?}}preserve_allcc void @boo()
}
diff --git a/test/CodeGen/ptwrite.c b/test/CodeGen/ptwrite.c
new file mode 100644
index 000000000000..a34ed741b5ff
--- /dev/null
+++ b/test/CodeGen/ptwrite.c
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +ptwrite -emit-llvm -o - -Wall -Werror -pedantic | FileCheck %s --check-prefix=X86 --check-prefix=X86_64
+// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +ptwrite -emit-llvm -o - -Wall -Werror -pedantic | FileCheck %s --check-prefix=X86
+
+#include <immintrin.h>
+
+#include <stdint.h>
+
+void test_ptwrite32(uint32_t value) {
+ //X86-LABEL: @test_ptwrite32
+ //X86: call void @llvm.x86.ptwrite32(i32 %{{.*}})
+ _ptwrite32(value);
+}
+
+#ifdef __x86_64__
+
+void test_ptwrite64(uint64_t value) {
+ //X86_64-LABEL: @test_ptwrite64
+ //X86_64: call void @llvm.x86.ptwrite64(i64 %{{.*}})
+ _ptwrite64(value);
+}
+
+#endif /* __x86_64__ */
diff --git a/test/CodeGen/rdpid-builtins.c b/test/CodeGen/rdpid-builtins.c
new file mode 100644
index 000000000000..0816bd07ad1d
--- /dev/null
+++ b/test/CodeGen/rdpid-builtins.c
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-unknown -target-feature +rdpid -emit-llvm -o - %s | FileCheck %s
+
+
+#include <immintrin.h>
+
+unsigned int test_rdpid_u32(void) {
+// CHECK-LABEL: @test_rdpid_u32
+// CHECK: call i32 @llvm.x86.rdpid
+ return _rdpid_u32();
+}
diff --git a/test/CodeGen/rdrand-builtins.c b/test/CodeGen/rdrand-builtins.c
index 936502b77483..44a38a048e96 100644
--- a/test/CodeGen/rdrand-builtins.c
+++ b/test/CodeGen/rdrand-builtins.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-unknown -target-feature +rdrnd -target-feature +rdseed -emit-llvm -o - %s | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
int rdrand16(unsigned short *p) {
return _rdrand16_step(p);
diff --git a/test/CodeGen/regcall.c b/test/CodeGen/regcall.c
index 350a82d33963..b7389904844a 100644
--- a/test/CodeGen/regcall.c
+++ b/test/CodeGen/regcall.c
@@ -6,34 +6,34 @@
#include <xmmintrin.h>
void __regcall v1(int a, int b) {}
-// Win32: define x86_regcallcc void @__regcall3__v1(i32 inreg %a, i32 inreg %b)
-// Win64: define x86_regcallcc void @__regcall3__v1(i32 %a, i32 %b)
+// Win32: define dso_local x86_regcallcc void @__regcall3__v1(i32 inreg %a, i32 inreg %b)
+// Win64: define dso_local x86_regcallcc void @__regcall3__v1(i32 %a, i32 %b)
// Lin32: define x86_regcallcc void @__regcall3__v1(i32 inreg %a, i32 inreg %b)
// Lin64: define x86_regcallcc void @__regcall3__v1(i32 %a, i32 %b)
void __attribute__((regcall)) v1b(int a, int b) {}
-// Win32: define x86_regcallcc void @__regcall3__v1b(i32 inreg %a, i32 inreg %b)
-// Win64: define x86_regcallcc void @__regcall3__v1b(i32 %a, i32 %b)
+// Win32: define dso_local x86_regcallcc void @__regcall3__v1b(i32 inreg %a, i32 inreg %b)
+// Win64: define dso_local x86_regcallcc void @__regcall3__v1b(i32 %a, i32 %b)
// Lin32: define x86_regcallcc void @__regcall3__v1b(i32 inreg %a, i32 inreg %b)
// Lin64: define x86_regcallcc void @__regcall3__v1b(i32 %a, i32 %b)
void __regcall v2(char a, char b) {}
-// Win32: define x86_regcallcc void @__regcall3__v2(i8 inreg signext %a, i8 inreg signext %b)
-// Win64: define x86_regcallcc void @__regcall3__v2(i8 %a, i8 %b)
+// Win32: define dso_local x86_regcallcc void @__regcall3__v2(i8 inreg signext %a, i8 inreg signext %b)
+// Win64: define dso_local x86_regcallcc void @__regcall3__v2(i8 %a, i8 %b)
// Lin32: define x86_regcallcc void @__regcall3__v2(i8 inreg signext %a, i8 inreg signext %b)
// Lin64: define x86_regcallcc void @__regcall3__v2(i8 signext %a, i8 signext %b)
struct Small { int x; };
void __regcall v3(int a, struct Small b, int c) {}
-// Win32: define x86_regcallcc void @__regcall3__v3(i32 inreg %a, i32 %b.0, i32 inreg %c)
-// Win64: define x86_regcallcc void @__regcall3__v3(i32 %a, i32 %b.coerce, i32 %c)
+// Win32: define dso_local x86_regcallcc void @__regcall3__v3(i32 inreg %a, i32 %b.0, i32 inreg %c)
+// Win64: define dso_local x86_regcallcc void @__regcall3__v3(i32 %a, i32 %b.coerce, i32 %c)
// Lin32: define x86_regcallcc void @__regcall3__v3(i32 inreg %a, i32 inreg, i32 %b.0, i32 inreg %c)
// Lin64: define x86_regcallcc void @__regcall3__v3(i32 %a, i32 %b.coerce, i32 %c)
struct Large { int a[5]; };
void __regcall v4(int a, struct Large b, int c) {}
-// Win32: define x86_regcallcc void @__regcall3__v4(i32 inreg %a, %struct.Large* byval align 4 %b, i32 inreg %c)
-// Win64: define x86_regcallcc void @__regcall3__v4(i32 %a, %struct.Large* %b, i32 %c)
+// Win32: define dso_local x86_regcallcc void @__regcall3__v4(i32 inreg %a, %struct.Large* byval align 4 %b, i32 inreg %c)
+// Win64: define dso_local x86_regcallcc void @__regcall3__v4(i32 %a, %struct.Large* %b, i32 %c)
// Lin32: define x86_regcallcc void @__regcall3__v4(i32 inreg %a, %struct.Large* byval align 4 %b, i32 %c)
// Lin64: define x86_regcallcc void @__regcall3__v4(i32 %a, [5 x i32] %b.coerce, i32 %c)
@@ -42,8 +42,8 @@ struct HFA4 { double w, x, y, z; };
struct HFA5 { double v, w, x, y, z; };
void __regcall hfa1(int a, struct HFA4 b, int c) {}
-// Win32: define x86_regcallcc void @__regcall3__hfa1(i32 inreg %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 inreg %c)
-// Win64: define x86_regcallcc void @__regcall3__hfa1(i32 %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 %c)
+// Win32: define dso_local x86_regcallcc void @__regcall3__hfa1(i32 inreg %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 inreg %c)
+// Win64: define dso_local x86_regcallcc void @__regcall3__hfa1(i32 %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 %c)
// Lin32: define x86_regcallcc void @__regcall3__hfa1(i32 inreg %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 inreg %c)
// Lin64: define x86_regcallcc void @__regcall3__hfa1(i32 %a, double %b.coerce0, double %b.coerce1, double %b.coerce2, double %b.coerce3, i32 %c)
@@ -51,16 +51,16 @@ void __regcall hfa1(int a, struct HFA4 b, int c) {}
// indirectly. Additional vector arguments can consume the rest of the SSE
// registers.
void __regcall hfa2(struct HFA4 a, struct HFA4 b, double c) {}
-// Win32: define x86_regcallcc void @__regcall3__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, double* inreg)
-// Win64: define x86_regcallcc void @__regcall3__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, double %c)
+// Win32: define dso_local x86_regcallcc void @__regcall3__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, double* inreg)
+// Win64: define dso_local x86_regcallcc void @__regcall3__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, double %c)
// Lin32: define x86_regcallcc void @__regcall3__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, double* inreg)
// Lin64: define x86_regcallcc void @__regcall3__hfa2(double %a.coerce0, double %a.coerce1, double %a.coerce2, double %a.coerce3, double %b.coerce0, double %b.coerce1, double %b.coerce2, double %b.coerce3, double %c)
// Ensure that we pass builtin types directly while counting them against the
// SSE register usage.
void __regcall hfa3(double a, double b, double c, double d, double e, struct HFA2 f) {}
-// Win32: define x86_regcallcc void @__regcall3__hfa3(double %a, double %b, double %c, double %d, double %e, double %f.0, double %f.1)
-// Win64: define x86_regcallcc void @__regcall3__hfa3(double %a, double %b, double %c, double %d, double %e, double %f.0, double %f.1)
+// Win32: define dso_local x86_regcallcc void @__regcall3__hfa3(double %a, double %b, double %c, double %d, double %e, double %f.0, double %f.1)
+// Win64: define dso_local x86_regcallcc void @__regcall3__hfa3(double %a, double %b, double %c, double %d, double %e, double %f.0, double %f.1)
// Lin32: define x86_regcallcc void @__regcall3__hfa3(double %a, double %b, double %c, double %d, double %e, double %f.0, double %f.1)
// Lin64: define x86_regcallcc void @__regcall3__hfa3(double %a, double %b, double %c, double %d, double %e, double %f.coerce0, double %f.coerce1)
@@ -68,16 +68,16 @@ void __regcall hfa3(double a, double b, double c, double d, double e, struct HFA
// Because they are not classified as homogeneous, they don't get special
// handling to ensure alignment.
void __regcall hfa4(struct HFA5 a) {}
-// Win32: define x86_regcallcc void @__regcall3__hfa4(%struct.HFA5* byval align 4)
-// Win64: define x86_regcallcc void @__regcall3__hfa4(%struct.HFA5* %a)
+// Win32: define dso_local x86_regcallcc void @__regcall3__hfa4(%struct.HFA5* byval align 4)
+// Win64: define dso_local x86_regcallcc void @__regcall3__hfa4(%struct.HFA5* %a)
// Lin32: define x86_regcallcc void @__regcall3__hfa4(%struct.HFA5* byval align 4 %a)
// Lin64: define x86_regcallcc void @__regcall3__hfa4(double %a.coerce0, double %a.coerce1, double %a.coerce2, double %a.coerce3, double %a.coerce4)
// Return HFAs of 4 or fewer elements in registers.
static struct HFA2 g_hfa2;
struct HFA2 __regcall hfa5(void) { return g_hfa2; }
-// Win32: define x86_regcallcc %struct.HFA2 @__regcall3__hfa5()
-// Win64: define x86_regcallcc %struct.HFA2 @__regcall3__hfa5()
+// Win32: define dso_local x86_regcallcc %struct.HFA2 @__regcall3__hfa5()
+// Win64: define dso_local x86_regcallcc %struct.HFA2 @__regcall3__hfa5()
// Lin32: define x86_regcallcc %struct.HFA2 @__regcall3__hfa5()
// Lin64: define x86_regcallcc %struct.HFA2 @__regcall3__hfa5()
@@ -86,20 +86,20 @@ struct HVA2 { v4f32 x, y; };
struct HVA4 { v4f32 w, x, y, z; };
void __regcall hva1(int a, struct HVA4 b, int c) {}
-// Win32: define x86_regcallcc void @__regcall3__hva1(i32 inreg %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 inreg %c)
-// Win64: define x86_regcallcc void @__regcall3__hva1(i32 %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 %c)
+// Win32: define dso_local x86_regcallcc void @__regcall3__hva1(i32 inreg %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 inreg %c)
+// Win64: define dso_local x86_regcallcc void @__regcall3__hva1(i32 %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 %c)
// Lin32: define x86_regcallcc void @__regcall3__hva1(i32 inreg %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 inreg %c)
// Lin64: define x86_regcallcc void @__regcall3__hva1(i32 %a, <4 x float> %b.coerce0, <4 x float> %b.coerce1, <4 x float> %b.coerce2, <4 x float> %b.coerce3, i32 %c)
void __regcall hva2(struct HVA4 a, struct HVA4 b, v4f32 c) {}
-// Win32: define x86_regcallcc void @__regcall3__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, <4 x float>* inreg)
-// Win64: define x86_regcallcc void @__regcall3__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, <4 x float> %c)
+// Win32: define dso_local x86_regcallcc void @__regcall3__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, <4 x float>* inreg)
+// Win64: define dso_local x86_regcallcc void @__regcall3__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, <4 x float> %c)
// Lin32: define x86_regcallcc void @__regcall3__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, <4 x float>* inreg)
// Lin64: define x86_regcallcc void @__regcall3__hva2(<4 x float> %a.coerce0, <4 x float> %a.coerce1, <4 x float> %a.coerce2, <4 x float> %a.coerce3, <4 x float> %b.coerce0, <4 x float> %b.coerce1, <4 x float> %b.coerce2, <4 x float> %b.coerce3, <4 x float> %c)
void __regcall hva3(v4f32 a, v4f32 b, v4f32 c, v4f32 d, v4f32 e, struct HVA2 f) {}
-// Win32: define x86_regcallcc void @__regcall3__hva3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, <4 x float> %f.0, <4 x float> %f.1)
-// Win64: define x86_regcallcc void @__regcall3__hva3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, <4 x float> %f.0, <4 x float> %f.1)
+// Win32: define dso_local x86_regcallcc void @__regcall3__hva3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, <4 x float> %f.0, <4 x float> %f.1)
+// Win64: define dso_local x86_regcallcc void @__regcall3__hva3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, <4 x float> %f.0, <4 x float> %f.1)
// Lin32: define x86_regcallcc void @__regcall3__hva3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, <4 x float> %f.0, <4 x float> %f.1)
// Lin64: define x86_regcallcc void @__regcall3__hva3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, <4 x float> %f.coerce0, <4 x float> %f.coerce1)
@@ -107,14 +107,14 @@ typedef float __attribute__((ext_vector_type(3))) v3f32;
struct OddSizeHVA { v3f32 x, y; };
void __regcall odd_size_hva(struct OddSizeHVA a) {}
-// Win32: define x86_regcallcc void @__regcall3__odd_size_hva(<3 x float> %a.0, <3 x float> %a.1)
-// Win64: define x86_regcallcc void @__regcall3__odd_size_hva(<3 x float> %a.0, <3 x float> %a.1)
+// Win32: define dso_local x86_regcallcc void @__regcall3__odd_size_hva(<3 x float> %a.0, <3 x float> %a.1)
+// Win64: define dso_local x86_regcallcc void @__regcall3__odd_size_hva(<3 x float> %a.0, <3 x float> %a.1)
// Lin32: define x86_regcallcc void @__regcall3__odd_size_hva(<3 x float> %a.0, <3 x float> %a.1)
// Lin64: define x86_regcallcc void @__regcall3__odd_size_hva(<3 x float> %a.coerce0, <3 x float> %a.coerce1)
struct HFA6 { __m128 f[4]; };
struct HFA6 __regcall ret_reg_reused(struct HFA6 a, struct HFA6 b, struct HFA6 c, struct HFA6 d){ struct HFA6 h; return h;}
-// Win32: define x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, %struct.HFA6* inreg %c, %struct.HFA6* inreg %d)
-// Win64: define x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, <4 x float> %c.0, <4 x float> %c.1, <4 x float> %c.2, <4 x float> %c.3, <4 x float> %d.0, <4 x float> %d.1, <4 x float> %d.2, <4 x float> %d.3)
+// Win32: define dso_local x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, %struct.HFA6* inreg %c, %struct.HFA6* inreg %d)
+// Win64: define dso_local x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, <4 x float> %c.0, <4 x float> %c.1, <4 x float> %c.2, <4 x float> %c.3, <4 x float> %d.0, <4 x float> %d.1, <4 x float> %d.2, <4 x float> %d.3)
// Lin32: define x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, %struct.HFA6* inreg %c, %struct.HFA6* inreg %d)
// Lin64: define x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused([4 x <4 x float>] %a.coerce, [4 x <4 x float>] %b.coerce, [4 x <4 x float>] %c.coerce, [4 x <4 x float>] %d.coerce)
diff --git a/test/CodeGen/riscv32-abi.c b/test/CodeGen/riscv32-abi.c
new file mode 100644
index 000000000000..04eceb3a8182
--- /dev/null
+++ b/test/CodeGen/riscv32-abi.c
@@ -0,0 +1,430 @@
+// RUN: %clang_cc1 -triple riscv32 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple riscv32 -emit-llvm -fforce-enable-int128 %s -o - \
+// RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-FORCEINT128
+
+#include <stddef.h>
+#include <stdint.h>
+
+// CHECK-LABEL: define void @f_void()
+void f_void(void) {}
+
+// Scalar arguments and return values smaller than the word size are extended
+// according to the sign of their type, up to 32 bits
+
+// CHECK-LABEL: define zeroext i1 @f_scalar_0(i1 zeroext %x)
+_Bool f_scalar_0(_Bool x) { return x; }
+
+// CHECK-LABEL: define signext i8 @f_scalar_1(i8 signext %x)
+int8_t f_scalar_1(int8_t x) { return x; }
+
+// CHECK-LABEL: define zeroext i8 @f_scalar_2(i8 zeroext %x)
+uint8_t f_scalar_2(uint8_t x) { return x; }
+
+// CHECK-LABEL: define i32 @f_scalar_3(i32 %x)
+int32_t f_scalar_3(int32_t x) { return x; }
+
+// CHECK-LABEL: define i64 @f_scalar_4(i64 %x)
+int64_t f_scalar_4(int64_t x) { return x; }
+
+#ifdef __SIZEOF_INT128__
+// CHECK-FORCEINT128-LABEL: define i128 @f_scalar_5(i128 %x)
+__int128_t f_scalar_5(__int128_t x) { return x; }
+#endif
+
+// CHECK-LABEL: define float @f_fp_scalar_1(float %x)
+float f_fp_scalar_1(float x) { return x; }
+
+// CHECK-LABEL: define double @f_fp_scalar_2(double %x)
+double f_fp_scalar_2(double x) { return x; }
+
+// Scalars larger than 2*xlen are passed/returned indirectly. However, the
+// RISC-V LLVM backend can handle this fine, so the function doesn't need to
+// be modified.
+
+// CHECK-LABEL: define fp128 @f_fp_scalar_3(fp128 %x)
+long double f_fp_scalar_3(long double x) { return x; }
+
+// Empty structs or unions are ignored.
+
+struct empty_s {};
+
+// CHECK-LABEL: define void @f_agg_empty_struct()
+struct empty_s f_agg_empty_struct(struct empty_s x) {
+ return x;
+}
+
+union empty_u {};
+
+// CHECK-LABEL: define void @f_agg_empty_union()
+union empty_u f_agg_empty_union(union empty_u x) {
+ return x;
+}
+
+// Aggregates <= 2*xlen may be passed in registers, so will be coerced to
+// integer arguments. The rules for return are the same.
+
+struct tiny {
+ uint8_t a, b, c, d;
+};
+
+// CHECK-LABEL: define void @f_agg_tiny(i32 %x.coerce)
+void f_agg_tiny(struct tiny x) {
+ x.a += x.b;
+ x.c += x.d;
+}
+
+// CHECK-LABEL: define i32 @f_agg_tiny_ret()
+struct tiny f_agg_tiny_ret() {
+ return (struct tiny){1, 2, 3, 4};
+}
+
+typedef uint8_t v4i8 __attribute__((vector_size(4)));
+typedef int32_t v1i32 __attribute__((vector_size(4)));
+
+// CHECK-LABEL: define void @f_vec_tiny_v4i8(i32 %x.coerce)
+void f_vec_tiny_v4i8(v4i8 x) {
+ x[0] = x[1];
+ x[2] = x[3];
+}
+
+// CHECK-LABEL: define i32 @f_vec_tiny_v4i8_ret()
+v4i8 f_vec_tiny_v4i8_ret() {
+ return (v4i8){1, 2, 3, 4};
+}
+
+// CHECK-LABEL: define void @f_vec_tiny_v1i32(i32 %x.coerce)
+void f_vec_tiny_v1i32(v1i32 x) {
+ x[0] = 114;
+}
+
+// CHECK-LABEL: define i32 @f_vec_tiny_v1i32_ret()
+v1i32 f_vec_tiny_v1i32_ret() {
+ return (v1i32){1};
+}
+
+struct small {
+ int32_t a, *b;
+};
+
+// CHECK-LABEL: define void @f_agg_small([2 x i32] %x.coerce)
+void f_agg_small(struct small x) {
+ x.a += *x.b;
+ x.b = &x.a;
+}
+
+// CHECK-LABEL: define [2 x i32] @f_agg_small_ret()
+struct small f_agg_small_ret() {
+ return (struct small){1, 0};
+}
+
+typedef uint8_t v8i8 __attribute__((vector_size(8)));
+typedef int64_t v1i64 __attribute__((vector_size(8)));
+
+// CHECK-LABEL: define void @f_vec_small_v8i8(i64 %x.coerce)
+void f_vec_small_v8i8(v8i8 x) {
+ x[0] = x[7];
+}
+
+// CHECK-LABEL: define i64 @f_vec_small_v8i8_ret()
+v8i8 f_vec_small_v8i8_ret() {
+ return (v8i8){1, 2, 3, 4, 5, 6, 7, 8};
+}
+
+// CHECK-LABEL: define void @f_vec_small_v1i64(i64 %x.coerce)
+void f_vec_small_v1i64(v1i64 x) {
+ x[0] = 114;
+}
+
+// CHECK-LABEL: define i64 @f_vec_small_v1i64_ret()
+v1i64 f_vec_small_v1i64_ret() {
+ return (v1i64){1};
+}
+
+// Aggregates of 2*xlen size and 2*xlen alignment should be coerced to a
+// single 2*xlen-sized argument, to ensure that alignment can be maintained if
+// passed on the stack.
+
+struct small_aligned {
+ int64_t a;
+};
+
+// CHECK-LABEL: define void @f_agg_small_aligned(i64 %x.coerce)
+void f_agg_small_aligned(struct small_aligned x) {
+ x.a += x.a;
+}
+
+// CHECK-LABEL: define i64 @f_agg_small_aligned_ret(i64 %x.coerce)
+struct small_aligned f_agg_small_aligned_ret(struct small_aligned x) {
+ return (struct small_aligned){10};
+}
+
+// Aggregates greater than 2*xlen will be passed and returned indirectly
+struct large {
+ int32_t a, b, c, d;
+};
+
+// CHECK-LABEL: define void @f_agg_large(%struct.large* %x)
+void f_agg_large(struct large x) {
+ x.a = x.b + x.c + x.d;
+}
+
+// The address that the struct should be written to is passed as the first
+// argument.
+// CHECK-LABEL: define void @f_agg_large_ret(%struct.large* noalias sret %agg.result, i32 %i, i8 signext %j)
+struct large f_agg_large_ret(int32_t i, int8_t j) {
+ return (struct large){1, 2, 3, 4};
+}
+
+typedef unsigned char v16i8 __attribute__((vector_size(16)));
+
+// CHECK-LABEL: define void @f_vec_large_v16i8(<16 x i8>*)
+void f_vec_large_v16i8(v16i8 x) {
+ x[0] = x[7];
+}
+
+// CHECK-LABEL: define void @f_vec_large_v16i8_ret(<16 x i8>* noalias sret %agg.result)
+v16i8 f_vec_large_v16i8_ret() {
+ return (v16i8){1, 2, 3, 4, 5, 6, 7, 8};
+}
+
+// Scalars passed on the stack should not have signext/zeroext attributes
+// (they are anyext).
+
+// CHECK-LABEL: define i32 @f_scalar_stack_1(i32 %a.coerce, [2 x i32] %b.coerce, i64 %c.coerce, %struct.large* %d, i8 zeroext %e, i8 signext %f, i8 %g, i8 %h)
+int f_scalar_stack_1(struct tiny a, struct small b, struct small_aligned c,
+ struct large d, uint8_t e, int8_t f, uint8_t g, int8_t h) {
+ return g + h;
+}
+
+// CHECK-LABEL: define i32 @f_scalar_stack_2(i32 %a, i64 %b, float %c, double %d, fp128 %e, i8 zeroext %f, i8 %g, i8 %h)
+int f_scalar_stack_2(int32_t a, int64_t b, float c, double d, long double e,
+ uint8_t f, int8_t g, uint8_t h) {
+ return g + h;
+}
+
+// Ensure that scalars passed on the stack are still determined correctly in
+// the presence of large return values that consume a register due to the need
+// to pass a pointer.
+
+// CHECK-LABEL: define void @f_scalar_stack_3(%struct.large* noalias sret %agg.result, i32 %a, i64 %b, double %c, fp128 %d, i8 zeroext %e, i8 %f, i8 %g)
+struct large f_scalar_stack_3(int32_t a, int64_t b, double c, long double d,
+ uint8_t e, int8_t f, uint8_t g) {
+ return (struct large){a, e, f, g};
+}
+
+// CHECK-LABEL: define fp128 @f_scalar_stack_4(i32 %a, i64 %b, double %c, fp128 %d, i8 zeroext %e, i8 %f, i8 %g)
+long double f_scalar_stack_4(int32_t a, int64_t b, double c, long double d,
+ uint8_t e, int8_t f, uint8_t g) {
+ return d;
+}
+
+// Aggregates and >=XLen scalars passed on the stack should be lowered just as
+// they would be if passed via registers.
+
+// CHECK-LABEL: define void @f_scalar_stack_5(double %a, i64 %b, double %c, i64 %d, i32 %e, i64 %f, float %g, double %h, fp128 %i)
+void f_scalar_stack_5(double a, int64_t b, double c, int64_t d, int e,
+ int64_t f, float g, double h, long double i) {}
+
+// CHECK-LABEL: define void @f_agg_stack(double %a, i64 %b, double %c, i64 %d, i32 %e.coerce, [2 x i32] %f.coerce, i64 %g.coerce, %struct.large* %h)
+void f_agg_stack(double a, int64_t b, double c, int64_t d, struct tiny e,
+ struct small f, struct small_aligned g, struct large h) {}
+
+// Ensure that ABI lowering happens as expected for vararg calls. For RV32
+// with the base integer calling convention there will be no observable
+// differences in the lowered IR for a call with varargs vs without.
+
+int f_va_callee(int, ...);
+
+// CHECK-LABEL: define void @f_va_caller()
+// CHECK: call i32 (i32, ...) @f_va_callee(i32 1, i32 2, i64 3, double 4.000000e+00, double 5.000000e+00, i32 {{%.*}}, [2 x i32] {{%.*}}, i64 {{%.*}}, %struct.large* {{%.*}})
+void f_va_caller() {
+ f_va_callee(1, 2, 3LL, 4.0f, 5.0, (struct tiny){6, 7, 8, 9},
+ (struct small){10, NULL}, (struct small_aligned){11},
+ (struct large){12, 13, 14, 15});
+}
+
+// CHECK-LABEL: define i32 @f_va_1(i8* %fmt, ...) {{.*}} {
+// CHECK: [[FMT_ADDR:%.*]] = alloca i8*, align 4
+// CHECK: [[VA:%.*]] = alloca i8*, align 4
+// CHECK: [[V:%.*]] = alloca i32, align 4
+// CHECK: store i8* %fmt, i8** [[FMT_ADDR]], align 4
+// CHECK: [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK: call void @llvm.va_start(i8* [[VA1]])
+// CHECK: [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i32 4
+// CHECK: store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK: [[TMP0:%.*]] = bitcast i8* [[ARGP_CUR]] to i32*
+// CHECK: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK: store i32 [[TMP1]], i32* [[V]], align 4
+// CHECK: [[VA2:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK: call void @llvm.va_end(i8* [[VA2]])
+// CHECK: [[TMP2:%.*]] = load i32, i32* [[V]], align 4
+// CHECK: ret i32 [[TMP2]]
+// CHECK: }
+int f_va_1(char *fmt, ...) {
+ __builtin_va_list va;
+
+ __builtin_va_start(va, fmt);
+ int v = __builtin_va_arg(va, int);
+ __builtin_va_end(va);
+
+ return v;
+}
+
+// An "aligned" register pair (where the first register is even-numbered) is
+// used to pass varargs with 2x xlen alignment and 2x xlen size. Ensure the
+// correct offsets are used.
+
+// CHECK-LABEL: @f_va_2(
+// CHECK: [[FMT_ADDR:%.*]] = alloca i8*, align 4
+// CHECK-NEXT: [[VA:%.*]] = alloca i8*, align 4
+// CHECK-NEXT: [[V:%.*]] = alloca double, align 8
+// CHECK-NEXT: store i8* [[FMT:%.*]], i8** [[FMT_ADDR]], align 4
+// CHECK-NEXT: [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT: call void @llvm.va_start(i8* [[VA1]])
+// CHECK-NEXT: [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint i8* [[ARGP_CUR]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 7
+// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], -8
+// CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = inttoptr i32 [[TMP2]] to i8*
+// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR_ALIGNED]], i32 8
+// CHECK-NEXT: store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[ARGP_CUR_ALIGNED]] to double*
+// CHECK-NEXT: [[TMP4:%.*]] = load double, double* [[TMP3]], align 8
+// CHECK-NEXT: store double [[TMP4]], double* [[V]], align 8
+// CHECK-NEXT: [[VA2:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT: call void @llvm.va_end(i8* [[VA2]])
+// CHECK-NEXT: [[TMP5:%.*]] = load double, double* [[V]], align 8
+// CHECK-NEXT: ret double [[TMP5]]
+double f_va_2(char *fmt, ...) {
+ __builtin_va_list va;
+
+ __builtin_va_start(va, fmt);
+ double v = __builtin_va_arg(va, double);
+ __builtin_va_end(va);
+
+ return v;
+}
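The pointer arithmetic the CHECK lines above expect (add 7, then mask with -8) is simply rounding the va cursor up to an 8-byte boundary before reading a 2*xlen-aligned vararg on RV32. A minimal sketch of that rounding in plain C; the name align_up8 and the standalone helper are illustrative assumptions, not code from the test or from Clang:

#include <stdint.h>

/* Round a va cursor up to an 8-byte boundary, mirroring the
   "add 7 then and with -8" sequence in the generated IR. */
static inline uintptr_t align_up8(uintptr_t cur) {
  return (cur + 7u) & ~(uintptr_t)7u;
}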
+
+// Two "aligned" register pairs.
+
+// CHECK-LABEL: @f_va_3(
+// CHECK: [[FMT_ADDR:%.*]] = alloca i8*, align 4
+// CHECK-NEXT: [[VA:%.*]] = alloca i8*, align 4
+// CHECK-NEXT: [[V:%.*]] = alloca double, align 8
+// CHECK-NEXT: [[W:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[X:%.*]] = alloca double, align 8
+// CHECK-NEXT: store i8* [[FMT:%.*]], i8** [[FMT_ADDR]], align 4
+// CHECK-NEXT: [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT: call void @llvm.va_start(i8* [[VA1]])
+// CHECK-NEXT: [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint i8* [[ARGP_CUR]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 7
+// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], -8
+// CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = inttoptr i32 [[TMP2]] to i8*
+// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR_ALIGNED]], i32 8
+// CHECK-NEXT: store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[ARGP_CUR_ALIGNED]] to double*
+// CHECK-NEXT: [[TMP4:%.*]] = load double, double* [[TMP3]], align 8
+// CHECK-NEXT: store double [[TMP4]], double* [[V]], align 8
+// CHECK-NEXT: [[ARGP_CUR2:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT: [[ARGP_NEXT3:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR2]], i32 4
+// CHECK-NEXT: store i8* [[ARGP_NEXT3]], i8** [[VA]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[ARGP_CUR2]] to i32*
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK-NEXT: store i32 [[TMP6]], i32* [[W]], align 4
+// CHECK-NEXT: [[ARGP_CUR4:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = ptrtoint i8* [[ARGP_CUR4]] to i32
+// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 7
+// CHECK-NEXT: [[TMP9:%.*]] = and i32 [[TMP8]], -8
+// CHECK-NEXT: [[ARGP_CUR4_ALIGNED:%.*]] = inttoptr i32 [[TMP9]] to i8*
+// CHECK-NEXT: [[ARGP_NEXT5:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR4_ALIGNED]], i32 8
+// CHECK-NEXT: store i8* [[ARGP_NEXT5]], i8** [[VA]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[ARGP_CUR4_ALIGNED]] to double*
+// CHECK-NEXT: [[TMP11:%.*]] = load double, double* [[TMP10]], align 8
+// CHECK-NEXT: store double [[TMP11]], double* [[X]], align 8
+// CHECK-NEXT: [[VA6:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT: call void @llvm.va_end(i8* [[VA6]])
+// CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[V]], align 8
+// CHECK-NEXT: [[TMP13:%.*]] = load double, double* [[X]], align 8
+// CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP12]], [[TMP13]]
+// CHECK-NEXT: ret double [[ADD]]
+double f_va_3(char *fmt, ...) {
+ __builtin_va_list va;
+
+ __builtin_va_start(va, fmt);
+ double v = __builtin_va_arg(va, double);
+ int w = __builtin_va_arg(va, int);
+ double x = __builtin_va_arg(va, double);
+ __builtin_va_end(va);
+
+ return v + x;
+}
+
+// CHECK-LABEL: define i32 @f_va_4(i8* %fmt, ...) {{.*}} {
+// CHECK: [[FMT_ADDR:%.*]] = alloca i8*, align 4
+// CHECK-NEXT: [[VA:%.*]] = alloca i8*, align 4
+// CHECK-NEXT: [[V:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[LD:%.*]] = alloca fp128, align 16
+// CHECK-NEXT: [[TS:%.*]] = alloca [[STRUCT_TINY:%.*]], align 1
+// CHECK-NEXT: [[SS:%.*]] = alloca [[STRUCT_SMALL:%.*]], align 4
+// CHECK-NEXT: [[LS:%.*]] = alloca [[STRUCT_LARGE:%.*]], align 4
+// CHECK-NEXT: [[RET:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i8* [[FMT:%.*]], i8** [[FMT_ADDR]], align 4
+// CHECK-NEXT: [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT: call void @llvm.va_start(i8* [[VA1]])
+// CHECK-NEXT: [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i32 4
+// CHECK-NEXT: store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[ARGP_CUR]] to i32*
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-NEXT: store i32 [[TMP1]], i32* [[V]], align 4
+// CHECK-NEXT: [[ARGP_CUR2:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT: [[ARGP_NEXT3:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR2]], i32 4
+// CHECK-NEXT: store i8* [[ARGP_NEXT3]], i8** [[VA]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[ARGP_CUR2]] to fp128**
+// CHECK-NEXT: [[TMP3:%.*]] = load fp128*, fp128** [[TMP2]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load fp128, fp128* [[TMP3]], align 16
+// CHECK-NEXT: store fp128 [[TMP4]], fp128* [[LD]], align 16
+// CHECK-NEXT: [[ARGP_CUR4:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT: [[ARGP_NEXT5:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR4]], i32 4
+// CHECK-NEXT: store i8* [[ARGP_NEXT5]], i8** [[VA]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[ARGP_CUR4]] to %struct.tiny*
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast %struct.tiny* [[TS]] to i8*
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast %struct.tiny* [[TMP5]] to i8*
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[TMP6]], i8* align 4 [[TMP7]], i32 4, i1 false)
+// CHECK-NEXT: [[ARGP_CUR6:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT: [[ARGP_NEXT7:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR6]], i32 8
+// CHECK-NEXT: store i8* [[ARGP_NEXT7]], i8** [[VA]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[ARGP_CUR6]] to %struct.small*
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast %struct.small* [[SS]] to i8*
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast %struct.small* [[TMP8]] to i8*
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP9]], i8* align 4 [[TMP10]], i32 8, i1 false)
+// CHECK-NEXT: [[ARGP_CUR8:%.*]] = load i8*, i8** [[VA]], align 4
+// CHECK-NEXT: [[ARGP_NEXT9:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR8]], i32 4
+// CHECK-NEXT: store i8* [[ARGP_NEXT9]], i8** [[VA]], align 4
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[ARGP_CUR8]] to %struct.large**
+// CHECK-NEXT: [[TMP12:%.*]] = load %struct.large*, %struct.large** [[TMP11]], align 4
+// CHECK-NEXT: [[TMP13:%.*]] = bitcast %struct.large* [[LS]] to i8*
+// CHECK-NEXT: [[TMP14:%.*]] = bitcast %struct.large* [[TMP12]] to i8*
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i32 16, i1 false)
+// CHECK-NEXT: [[VA10:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT: call void @llvm.va_end(i8* [[VA10]])
+int f_va_4(char *fmt, ...) {
+ __builtin_va_list va;
+
+ __builtin_va_start(va, fmt);
+ int v = __builtin_va_arg(va, int);
+ long double ld = __builtin_va_arg(va, long double);
+ struct tiny ts = __builtin_va_arg(va, struct tiny);
+ struct small ss = __builtin_va_arg(va, struct small);
+ struct large ls = __builtin_va_arg(va, struct large);
+ __builtin_va_end(va);
+
+ int ret = (int)((long double)v + ld);
+ ret = ret + ts.a + ts.b + ts.c + ts.d;
+ ret = ret + ss.a + (int)ss.b;
+ ret = ret + ls.a + ls.b + ls.c + ls.d;
+
+ return ret;
+}
diff --git a/test/CodeGen/riscv64-abi.c b/test/CodeGen/riscv64-abi.c
new file mode 100644
index 000000000000..7a0f065fb4cf
--- /dev/null
+++ b/test/CodeGen/riscv64-abi.c
@@ -0,0 +1,422 @@
+// RUN: %clang_cc1 -triple riscv64 -emit-llvm %s -o - | FileCheck %s
+
+#include <stddef.h>
+#include <stdint.h>
+
+// CHECK-LABEL: define void @f_void()
+void f_void(void) {}
+
+// Scalar arguments and return values smaller than the word size are extended
+// according to the sign of their type, up to 32 bits
+
+// CHECK-LABEL: define zeroext i1 @f_scalar_0(i1 zeroext %x)
+_Bool f_scalar_0(_Bool x) { return x; }
+
+// CHECK-LABEL: define signext i8 @f_scalar_1(i8 signext %x)
+int8_t f_scalar_1(int8_t x) { return x; }
+
+// CHECK-LABEL: define zeroext i8 @f_scalar_2(i8 zeroext %x)
+uint8_t f_scalar_2(uint8_t x) { return x; }
+
+// CHECK-LABEL: define signext i32 @f_scalar_3(i32 signext %x)
+uint32_t f_scalar_3(int32_t x) { return x; }
+
+// CHECK-LABEL: define i64 @f_scalar_4(i64 %x)
+int64_t f_scalar_4(int64_t x) { return x; }
+
+// CHECK-LABEL: define float @f_fp_scalar_1(float %x)
+float f_fp_scalar_1(float x) { return x; }
+
+// CHECK-LABEL: define double @f_fp_scalar_2(double %x)
+double f_fp_scalar_2(double x) { return x; }
+
+// CHECK-LABEL: define fp128 @f_fp_scalar_3(fp128 %x)
+long double f_fp_scalar_3(long double x) { return x; }
+
+// Empty structs or unions are ignored.
+
+struct empty_s {};
+
+// CHECK-LABEL: define void @f_agg_empty_struct()
+struct empty_s f_agg_empty_struct(struct empty_s x) {
+ return x;
+}
+
+union empty_u {};
+
+// CHECK-LABEL: define void @f_agg_empty_union()
+union empty_u f_agg_empty_union(union empty_u x) {
+ return x;
+}
+
+// Aggregates <= 2*xlen may be passed in registers, so will be coerced to
+// integer arguments. The rules for return are the same.
+
+struct tiny {
+ uint16_t a, b, c, d;
+};
+
+// CHECK-LABEL: define void @f_agg_tiny(i64 %x.coerce)
+void f_agg_tiny(struct tiny x) {
+ x.a += x.b;
+ x.c += x.d;
+}
+
+// CHECK-LABEL: define i64 @f_agg_tiny_ret()
+struct tiny f_agg_tiny_ret() {
+ return (struct tiny){1, 2, 3, 4};
+}
+
+typedef uint16_t v4i16 __attribute__((vector_size(8)));
+typedef int64_t v1i64 __attribute__((vector_size(8)));
+
+// CHECK-LABEL: define void @f_vec_tiny_v4i16(i64 %x.coerce)
+void f_vec_tiny_v4i16(v4i16 x) {
+ x[0] = x[1];
+ x[2] = x[3];
+}
+
+// CHECK-LABEL: define i64 @f_vec_tiny_v4i16_ret()
+v4i16 f_vec_tiny_v4i16_ret() {
+ return (v4i16){1, 2, 3, 4};
+}
+
+// CHECK-LABEL: define void @f_vec_tiny_v1i64(i64 %x.coerce)
+void f_vec_tiny_v1i64(v1i64 x) {
+ x[0] = 114;
+}
+
+// CHECK-LABEL: define i64 @f_vec_tiny_v1i64_ret()
+v1i64 f_vec_tiny_v1i64_ret() {
+ return (v1i64){1};
+}
+
+struct small {
+ int64_t a, *b;
+};
+
+// CHECK-LABEL: define void @f_agg_small([2 x i64] %x.coerce)
+void f_agg_small(struct small x) {
+ x.a += *x.b;
+ x.b = &x.a;
+}
+
+// CHECK-LABEL: define [2 x i64] @f_agg_small_ret()
+struct small f_agg_small_ret() {
+ return (struct small){1, 0};
+}
+
+typedef uint16_t v8i16 __attribute__((vector_size(16)));
+typedef __int128_t v1i128 __attribute__((vector_size(16)));
+
+// CHECK-LABEL: define void @f_vec_small_v8i16(i128 %x.coerce)
+void f_vec_small_v8i16(v8i16 x) {
+ x[0] = x[7];
+}
+
+// CHECK-LABEL: define i128 @f_vec_small_v8i16_ret()
+v8i16 f_vec_small_v8i16_ret() {
+ return (v8i16){1, 2, 3, 4, 5, 6, 7, 8};
+}
+
+// CHECK-LABEL: define void @f_vec_small_v1i128(i128 %x.coerce)
+void f_vec_small_v1i128(v1i128 x) {
+ x[0] = 114;
+}
+
+// CHECK-LABEL: define i128 @f_vec_small_v1i128_ret()
+v1i128 f_vec_small_v1i128_ret() {
+ return (v1i128){1};
+}
+
+// Aggregates of 2*xlen size and 2*xlen alignment should be coerced to a
+// single 2*xlen-sized argument, to ensure that alignment can be maintained if
+// passed on the stack.
+
+struct small_aligned {
+ __int128_t a;
+};
+
+// CHECK-LABEL: define void @f_agg_small_aligned(i128 %x.coerce)
+void f_agg_small_aligned(struct small_aligned x) {
+ x.a += x.a;
+}
+
+// CHECK-LABEL: define i128 @f_agg_small_aligned_ret(i128 %x.coerce)
+struct small_aligned f_agg_small_aligned_ret(struct small_aligned x) {
+ return (struct small_aligned){10};
+}
+
+// Aggregates greater than 2*xlen will be passed and returned indirectly
+struct large {
+ int64_t a, b, c, d;
+};
+
+// CHECK-LABEL: define void @f_agg_large(%struct.large* %x)
+void f_agg_large(struct large x) {
+ x.a = x.b + x.c + x.d;
+}
+
+// The address that the struct should be written to is passed as the first
+// argument.
+// CHECK-LABEL: define void @f_agg_large_ret(%struct.large* noalias sret %agg.result, i32 signext %i, i8 signext %j)
+struct large f_agg_large_ret(int32_t i, int8_t j) {
+ return (struct large){1, 2, 3, 4};
+}
+
+typedef unsigned char v32i8 __attribute__((vector_size(32)));
+
+// CHECK-LABEL: define void @f_vec_large_v32i8(<32 x i8>*)
+void f_vec_large_v32i8(v32i8 x) {
+ x[0] = x[7];
+}
+
+// CHECK-LABEL: define void @f_vec_large_v32i8_ret(<32 x i8>* noalias sret %agg.result)
+v32i8 f_vec_large_v32i8_ret() {
+ return (v32i8){1, 2, 3, 4, 5, 6, 7, 8};
+}
+
+// Scalars passed on the stack should not have signext/zeroext attributes
+// (they are anyext).
+
+// CHECK-LABEL: define signext i32 @f_scalar_stack_1(i64 %a.coerce, [2 x i64] %b.coerce, i128 %c.coerce, %struct.large* %d, i8 zeroext %e, i8 signext %f, i8 %g, i8 %h)
+int f_scalar_stack_1(struct tiny a, struct small b, struct small_aligned c,
+ struct large d, uint8_t e, int8_t f, uint8_t g, int8_t h) {
+ return g + h;
+}
+
+// CHECK-LABEL: define signext i32 @f_scalar_stack_2(i32 signext %a, i128 %b, float %c, fp128 %d, <32 x i8>*, i8 zeroext %f, i8 %g, i8 %h)
+int f_scalar_stack_2(int32_t a, __int128_t b, float c, long double d, v32i8 e,
+ uint8_t f, int8_t g, uint8_t h) {
+ return g + h;
+}
+
+// Ensure that scalars passed on the stack are still determined correctly in
+// the presence of large return values that consume a register due to the need
+// to pass a pointer.
+
+// CHECK-LABEL: define void @f_scalar_stack_3(%struct.large* noalias sret %agg.result, i32 signext %a, i128 %b, fp128 %c, <32 x i8>*, i8 zeroext %e, i8 %f, i8 %g)
+struct large f_scalar_stack_3(uint32_t a, __int128_t b, long double c, v32i8 d,
+ uint8_t e, int8_t f, uint8_t g) {
+ return (struct large){a, e, f, g};
+}
+
+// Ensure that ABI lowering happens as expected for vararg calls.
+// Specifically, ensure that signext is emitted for varargs that will be
+// passed in registers but not on the stack. Ensure this takes into account
+// the use of "aligned" register pairs for varargs with 2*xlen alignment.
+
+int f_va_callee(int, ...);
+
+// CHECK-LABEL: define void @f_va_caller()
+void f_va_caller() {
+ // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i64 3, double 4.000000e+00, double 5.000000e+00, i64 {{%.*}}, [2 x i64] {{%.*}}, i128 {{%.*}}, %struct.large* {{%.*}})
+ f_va_callee(1, 2, 3LL, 4.0f, 5.0, (struct tiny){6, 7, 8, 9},
+ (struct small){10, NULL}, (struct small_aligned){11},
+ (struct large){12, 13, 14, 15});
+ // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, fp128 0xL00000000000000004001400000000000, i32 signext 6, i32 signext 7, i32 8, i32 9)
+ f_va_callee(1, 2, 3, 4, 5.0L, 6, 7, 8, 9);
+ // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, i128 {{%.*}}, i32 signext 6, i32 signext 7, i32 8, i32 9)
+ f_va_callee(1, 2, 3, 4, (struct small_aligned){5}, 6, 7, 8, 9);
+ // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, [2 x i64] {{%.*}}, i32 signext 6, i32 signext 7, i32 8, i32 9)
+ f_va_callee(1, 2, 3, 4, (struct small){5, NULL}, 6, 7, 8, 9);
+ // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, i32 signext 5, fp128 0xL00000000000000004001800000000000, i32 7, i32 8, i32 9)
+ f_va_callee(1, 2, 3, 4, 5, 6.0L, 7, 8, 9);
+ // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, i32 signext 5, i128 {{%.*}}, i32 7, i32 8, i32 9)
+ f_va_callee(1, 2, 3, 4, 5, (struct small_aligned){6}, 7, 8, 9);
+ // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, i32 signext 5, [2 x i64] {{%.*}}, i32 signext 7, i32 8, i32 9)
+ f_va_callee(1, 2, 3, 4, 5, (struct small){6, NULL}, 7, 8, 9);
+ // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, i32 signext 5, i32 signext 6, fp128 0xL00000000000000004001C00000000000, i32 8, i32 9)
+ f_va_callee(1, 2, 3, 4, 5, 6, 7.0L, 8, 9);
+ // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, i32 signext 5, i32 signext 6, i128 {{%.*}}, i32 8, i32 9)
+ f_va_callee(1, 2, 3, 4, 5, 6, (struct small_aligned){7}, 8, 9);
+ // CHECK: call signext i32 (i32, ...) @f_va_callee(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, i32 signext 5, i32 signext 6, [2 x i64] {{.*}}, i32 8, i32 9)
+ f_va_callee(1, 2, 3, 4, 5, 6, (struct small){7, NULL}, 8, 9);
+}
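For readers decoding the fp128 literals in the CHECK lines above: judging from these constants, the 0xL form appears to print the 128-bit IEEE quad value as the low 64 bits followed by the high 64 bits, with the high word carrying the sign, exponent, and top fraction bits. As a worked check (an aside, not part of the test), the constant used for 5.0L has high word 0x4001400000000000, which decodes as

\[ e = \mathtt{0x4001} - 16383 = 2, \qquad f = 0.25, \qquad (1 + f)\cdot 2^{e} = 1.25 \cdot 4 = 5.0 \]

and the 6.0L and 7.0L constants (high words 0x4001800000000000 and 0x4001C00000000000) follow the same pattern with f = 0.5 and f = 0.75.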
+
+// CHECK-LABEL: define signext i32 @f_va_1(i8* %fmt, ...) {{.*}} {
+// CHECK: [[FMT_ADDR:%.*]] = alloca i8*, align 8
+// CHECK: [[VA:%.*]] = alloca i8*, align 8
+// CHECK: [[V:%.*]] = alloca i32, align 4
+// CHECK: store i8* %fmt, i8** [[FMT_ADDR]], align 8
+// CHECK: [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK: call void @llvm.va_start(i8* [[VA1]])
+// CHECK: [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i64 8
+// CHECK: store i8* [[ARGP_NEXT]], i8** [[VA]], align 8
+// CHECK: [[TMP0:%.*]] = bitcast i8* [[ARGP_CUR]] to i32*
+// CHECK: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 8
+// CHECK: store i32 [[TMP1]], i32* [[V]], align 4
+// CHECK: [[VA2:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK: call void @llvm.va_end(i8* [[VA2]])
+// CHECK: [[TMP2:%.*]] = load i32, i32* [[V]], align 4
+// CHECK: ret i32 [[TMP2]]
+// CHECK: }
+int f_va_1(char *fmt, ...) {
+ __builtin_va_list va;
+
+ __builtin_va_start(va, fmt);
+ int v = __builtin_va_arg(va, int);
+ __builtin_va_end(va);
+
+ return v;
+}
+
+// An "aligned" register pair (where the first register is even-numbered) is
+// used to pass varargs with 2x xlen alignment and 2x xlen size. Ensure the
+// correct offsets are used.
+
+// CHECK-LABEL: @f_va_2(
+// CHECK: [[FMT_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: [[VA:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: [[V:%.*]] = alloca fp128, align 16
+// CHECK-NEXT: store i8* [[FMT:%.*]], i8** [[FMT_ADDR]], align 8
+// CHECK-NEXT: [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT: call void @llvm.va_start(i8* [[VA1]])
+// CHECK-NEXT: [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint i8* [[ARGP_CUR]] to i64
+// CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 15
+// CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], -16
+// CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = inttoptr i64 [[TMP2]] to i8*
+// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR_ALIGNED]], i64 16
+// CHECK-NEXT: store i8* [[ARGP_NEXT]], i8** [[VA]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[ARGP_CUR_ALIGNED]] to fp128*
+// CHECK-NEXT: [[TMP4:%.*]] = load fp128, fp128* [[TMP3]], align 16
+// CHECK-NEXT: store fp128 [[TMP4]], fp128* [[V]], align 16
+// CHECK-NEXT: [[VA2:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT: call void @llvm.va_end(i8* [[VA2]])
+// CHECK-NEXT: [[TMP5:%.*]] = load fp128, fp128* [[V]], align 16
+// CHECK-NEXT: ret fp128 [[TMP5]]
+long double f_va_2(char *fmt, ...) {
+ __builtin_va_list va;
+
+ __builtin_va_start(va, fmt);
+ long double v = __builtin_va_arg(va, long double);
+ __builtin_va_end(va);
+
+ return v;
+}
+
+// Two "aligned" register pairs.
+
+// CHECK-LABEL: @f_va_3(
+// CHECK: [[FMT_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: [[VA:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: [[V:%.*]] = alloca fp128, align 16
+// CHECK-NEXT: [[W:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[X:%.*]] = alloca fp128, align 16
+// CHECK-NEXT: store i8* [[FMT:%.*]], i8** [[FMT_ADDR]], align 8
+// CHECK-NEXT: [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT: call void @llvm.va_start(i8* [[VA1]])
+// CHECK-NEXT: [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint i8* [[ARGP_CUR]] to i64
+// CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 15
+// CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], -16
+// CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = inttoptr i64 [[TMP2]] to i8*
+// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR_ALIGNED]], i64 16
+// CHECK-NEXT: store i8* [[ARGP_NEXT]], i8** [[VA]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[ARGP_CUR_ALIGNED]] to fp128*
+// CHECK-NEXT: [[TMP4:%.*]] = load fp128, fp128* [[TMP3]], align 16
+// CHECK-NEXT: store fp128 [[TMP4]], fp128* [[V]], align 16
+// CHECK-NEXT: [[ARGP_CUR2:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK-NEXT: [[ARGP_NEXT3:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR2]], i64 8
+// CHECK-NEXT: store i8* [[ARGP_NEXT3]], i8** [[VA]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[ARGP_CUR2]] to i32*
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 8
+// CHECK-NEXT: store i32 [[TMP6]], i32* [[W]], align 4
+// CHECK-NEXT: [[ARGP_CUR4:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = ptrtoint i8* [[ARGP_CUR4]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 15
+// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], -16
+// CHECK-NEXT: [[ARGP_CUR4_ALIGNED:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-NEXT: [[ARGP_NEXT5:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR4_ALIGNED]], i64 16
+// CHECK-NEXT: store i8* [[ARGP_NEXT5]], i8** [[VA]], align 8
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[ARGP_CUR4_ALIGNED]] to fp128*
+// CHECK-NEXT: [[TMP11:%.*]] = load fp128, fp128* [[TMP10]], align 16
+// CHECK-NEXT: store fp128 [[TMP11]], fp128* [[X]], align 16
+// CHECK-NEXT: [[VA6:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT: call void @llvm.va_end(i8* [[VA6]])
+// CHECK-NEXT: [[TMP12:%.*]] = load fp128, fp128* [[V]], align 16
+// CHECK-NEXT: [[TMP13:%.*]] = load fp128, fp128* [[X]], align 16
+// CHECK-NEXT: [[ADD:%.*]] = fadd fp128 [[TMP12]], [[TMP13]]
+// CHECK-NEXT: ret fp128 [[ADD]]
+long double f_va_3(char *fmt, ...) {
+ __builtin_va_list va;
+
+ __builtin_va_start(va, fmt);
+ long double v = __builtin_va_arg(va, long double);
+ int w = __builtin_va_arg(va, int);
+ long double x = __builtin_va_arg(va, long double);
+ __builtin_va_end(va);
+
+ return v + x;
+}
+
+// CHECK-LABEL: @f_va_4(
+// CHECK: [[FMT_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: [[VA:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: [[V:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TS:%.*]] = alloca [[STRUCT_TINY:%.*]], align 2
+// CHECK-NEXT: [[SS:%.*]] = alloca [[STRUCT_SMALL:%.*]], align 8
+// CHECK-NEXT: [[LS:%.*]] = alloca [[STRUCT_LARGE:%.*]], align 8
+// CHECK-NEXT: [[RET:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i8* [[FMT:%.*]], i8** [[FMT_ADDR]], align 8
+// CHECK-NEXT: [[VA1:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT: call void @llvm.va_start(i8* [[VA1]])
+// CHECK-NEXT: [[ARGP_CUR:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i64 8
+// CHECK-NEXT: store i8* [[ARGP_NEXT]], i8** [[VA]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[ARGP_CUR]] to i32*
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 8
+// CHECK-NEXT: store i32 [[TMP1]], i32* [[V]], align 4
+// CHECK-NEXT: [[ARGP_CUR2:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK-NEXT: [[ARGP_NEXT3:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR2]], i64 8
+// CHECK-NEXT: store i8* [[ARGP_NEXT3]], i8** [[VA]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[ARGP_CUR2]] to %struct.tiny*
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.tiny* [[TS]] to i8*
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast %struct.tiny* [[TMP2]] to i8*
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[TMP3]], i8* align 8 [[TMP4]], i64 8, i1 false)
+// CHECK-NEXT: [[ARGP_CUR4:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK-NEXT: [[ARGP_NEXT5:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR4]], i64 16
+// CHECK-NEXT: store i8* [[ARGP_NEXT5]], i8** [[VA]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[ARGP_CUR4]] to %struct.small*
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast %struct.small* [[SS]] to i8*
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast %struct.small* [[TMP5]] to i8*
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
+// CHECK-NEXT: [[ARGP_CUR6:%.*]] = load i8*, i8** [[VA]], align 8
+// CHECK-NEXT: [[ARGP_NEXT7:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR6]], i64 8
+// CHECK-NEXT: store i8* [[ARGP_NEXT7]], i8** [[VA]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[ARGP_CUR6]] to %struct.large**
+// CHECK-NEXT: [[TMP9:%.*]] = load %struct.large*, %struct.large** [[TMP8]], align 8
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast %struct.large* [[LS]] to i8*
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast %struct.large* [[TMP9]] to i8*
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP10]], i8* align 8 [[TMP11]], i64 32, i1 false)
+// CHECK-NEXT: [[VA8:%.*]] = bitcast i8** [[VA]] to i8*
+// CHECK-NEXT: call void @llvm.va_end(i8* [[VA8]])
+// CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_TINY]], %struct.tiny* [[TS]], i32 0, i32 0
+// CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[A]], align 2
+// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP12]] to i64
+// CHECK-NEXT: [[A9:%.*]] = getelementptr inbounds [[STRUCT_SMALL]], %struct.small* [[SS]], i32 0, i32 0
+// CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[A9]], align 8
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[CONV]], [[TMP13]]
+// CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_LARGE]], %struct.large* [[LS]], i32 0, i32 2
+// CHECK-NEXT: [[TMP14:%.*]] = load i64, i64* [[C]], align 8
+// CHECK-NEXT: [[ADD10:%.*]] = add nsw i64 [[ADD]], [[TMP14]]
+// CHECK-NEXT: [[CONV11:%.*]] = trunc i64 [[ADD10]] to i32
+// CHECK-NEXT: store i32 [[CONV11]], i32* [[RET]], align 4
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[RET]], align 4
+// CHECK-NEXT: ret i32 [[TMP15]]
+int f_va_4(char *fmt, ...) {
+ __builtin_va_list va;
+
+ __builtin_va_start(va, fmt);
+ int v = __builtin_va_arg(va, int);
+ struct tiny ts = __builtin_va_arg(va, struct tiny);
+ struct small ss = __builtin_va_arg(va, struct small);
+ struct large ls = __builtin_va_arg(va, struct large);
+ __builtin_va_end(va);
+
+ int ret = ts.a + ss.a + ls.c;
+
+ return ret;
+}
diff --git a/test/CodeGen/shadowcallstack-attr.c b/test/CodeGen/shadowcallstack-attr.c
new file mode 100644
index 000000000000..45e710d87562
--- /dev/null
+++ b/test/CodeGen/shadowcallstack-attr.c
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -triple x86_64-linux-unknown -emit-llvm -o - %s -fsanitize=shadow-call-stack | FileCheck -check-prefix=UNBLACKLISTED %s
+
+// RUN: %clang_cc1 -D ATTR -triple x86_64-linux-unknown -emit-llvm -o - %s -fsanitize=shadow-call-stack | FileCheck -check-prefix=BLACKLISTED %s
+
+// RUN: echo -e "[shadow-call-stack]\nfun:foo" > %t
+// RUN: %clang_cc1 -fsanitize-blacklist=%t -triple x86_64-linux-unknown -emit-llvm -o - %s -fsanitize=shadow-call-stack | FileCheck -check-prefix=BLACKLISTED %s
+
+#ifdef ATTR
+__attribute__((no_sanitize("shadow-call-stack")))
+#endif
+int foo(int *a) { return *a; }
+
+// UNBLACKLISTED: define i32 @foo(i32* %a)
+
+// BLACKLISTED-NOT: attributes {{.*}}shadowcallstack{{.*}}
+// UNBLACKLISTED: attributes {{.*}}shadowcallstack{{.*}}
diff --git a/test/CodeGen/sparc-vaarg.c b/test/CodeGen/sparc-vaarg.c
index 3e4dd7c2c3f2..920b9a18b065 100644
--- a/test/CodeGen/sparc-vaarg.c
+++ b/test/CodeGen/sparc-vaarg.c
@@ -19,7 +19,7 @@ struct Foo dest;
// CHECK-LABEL: define void @get_struct
// CHECK: [[RESULT:%[a-z_0-9]+]] = va_arg {{.*}}, %struct.Foo*{{$}}
// CHECK: [[RESULT2:%[a-z_0-9]+]] = bitcast {{.*}} [[RESULT]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}@dest{{.*}}, i8* [[RESULT2]]
+// CHECK: call void @llvm.memcpy{{.*}}@dest{{.*}}, i8* align {{[0-9]+}} [[RESULT2]]
void get_struct(va_list *args) {
dest = va_arg(*args, struct Foo);
}
diff --git a/test/CodeGen/sparcv8-inline-asm.c b/test/CodeGen/sparcv8-inline-asm.c
index 711a2a0afbb0..8befe4556c84 100644
--- a/test/CodeGen/sparcv8-inline-asm.c
+++ b/test/CodeGen/sparcv8-inline-asm.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -triple sparc-unknown-unknown -emit-llvm %s -o - | FileCheck %s
// CHECK: define float @fabsf(float %a)
-// CHECK: %{{.*}} = call float asm sideeffect "fabss $1, $0;", "=e,f"(float %{{.*}}) #1
+// CHECK: %{{.*}} = call float asm sideeffect "fabss $1, $0;", "=e,f"(float %{{.*}})
float fabsf(float a) {
float res;
__asm __volatile__("fabss %1, %0;"
@@ -9,3 +9,34 @@ float fabsf(float a) {
: /* reg in */ "f"(a));
return res;
}
+
+void test_gcc_registers(void) {
+ register unsigned int regO6 asm("o6") = 0;
+ register unsigned int regSP asm("sp") = 1;
+ register unsigned int reg14 asm("r14") = 2;
+ register unsigned int regI6 asm("i6") = 3;
+ register unsigned int regFP asm("fp") = 4;
+ register unsigned int reg30 asm("r30") = 5;
+
+ register float fF20 asm("f20") = 8.0;
+ register double dF20 asm("f20") = 11.0;
+ register long double qF20 asm("f20") = 14.0;
+
+ // Test remapping register names in register ... asm("rN") statements.
+
+ // CHECK: call void asm sideeffect "add $0,$1,$2", "{r14},{r14},{r14}"
+ asm volatile("add %0,%1,%2" : : "r" (regO6), "r" (regSP), "r" (reg14));
+
+ // CHECK: call void asm sideeffect "add $0,$1,$2", "{r30},{r30},{r30}"
+ asm volatile("add %0,%1,%2" : : "r" (regI6), "r" (regFP), "r" (reg30));
+
+ // CHECK: call void asm sideeffect "fadds $0,$1,$2", "{f20},{f20},{f20}"
+ asm volatile("fadds %0,%1,%2" : : "f" (fF20), "f" (fF20), "f"(fF20));
+
+ // CHECK: call void asm sideeffect "faddd $0,$1,$2", "{f20},{f20},{f20}"
+ asm volatile("faddd %0,%1,%2" : : "f" (dF20), "f" (dF20), "f"(dF20));
+
+ // CHECK: call void asm sideeffect "faddq $0,$1,$2", "{f20},{f20},{f20}"
+ asm volatile("faddq %0,%1,%2" : : "f" (qF20), "f" (qF20), "f"(qF20));
+
+}
diff --git a/test/CodeGen/sparcv9-inline-asm.c b/test/CodeGen/sparcv9-inline-asm.c
new file mode 100644
index 000000000000..3a7ef0ea86e7
--- /dev/null
+++ b/test/CodeGen/sparcv9-inline-asm.c
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple sparcv9-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+
+void test_gcc_registers(void) {
+ register unsigned int regO6 asm("o6") = 0;
+ register unsigned int regSP asm("sp") = 1;
+ register unsigned int reg14 asm("r14") = 2;
+ register unsigned int regI6 asm("i6") = 3;
+ register unsigned int regFP asm("fp") = 4;
+ register unsigned int reg30 asm("r30") = 5;
+
+ register float fF20 asm("f20") = 8.0;
+ register double dF40 asm("f40") = 11.0;
+ register long double qF40 asm("f40") = 14.0;
+
+ // Test remapping register names in register ... asm("rN") statements.
+
+ // CHECK: call void asm sideeffect "add $0,$1,$2", "{r14},{r14},{r14}"
+ asm volatile("add %0,%1,%2" : : "r" (regO6), "r" (regSP), "r" (reg14));
+
+ // CHECK: call void asm sideeffect "add $0,$1,$2", "{r30},{r30},{r30}"
+ asm volatile("add %0,%1,%2" : : "r" (regI6), "r" (regFP), "r" (reg30));
+
+ // CHECK: call void asm sideeffect "fadds $0,$1,$2", "{f20},{f20},{f20}"
+ asm volatile("fadds %0,%1,%2" : : "f" (fF20), "f" (fF20), "f"(fF20));
+
+ // CHECK: call void asm sideeffect "faddd $0,$1,$2", "{f40},{f40},{f40}"
+ asm volatile("faddd %0,%1,%2" : : "f" (dF40), "f" (dF40), "f"(dF40));
+
+ // CHECK: call void asm sideeffect "faddq $0,$1,$2", "{f40},{f40},{f40}"
+ asm volatile("faddq %0,%1,%2" : : "f" (qF40), "f" (qF40), "f"(qF40));
+
+}
diff --git a/test/CodeGen/spir-half-type.cpp b/test/CodeGen/spir-half-type.cpp
new file mode 100644
index 000000000000..b60931fea6ed
--- /dev/null
+++ b/test/CodeGen/spir-half-type.cpp
@@ -0,0 +1,146 @@
+// RUN: %clang_cc1 -O0 -triple spir -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -O0 -triple spir64 -emit-llvm %s -o - | FileCheck %s
+
+// This file tests that using the _Float16 type with the spir target does not
+// emit the llvm fp16 conversion intrinsics and instead uses the half
+// arithmetic instructions directly.
+
+// Previously, attempting to use a constant _Float16 with a comparison
+// instruction when the target is spir or spir64 led to an assert being hit.
+bool fcmp_const() {
+ _Float16 a = 0.0f16;
+ const _Float16 b = 1.0f16;
+
+ // CHECK-NOT: llvm.convert.to.fp16
+ // CHECK-NOT: llvm.convert.from.fp16
+
+ // CHECK: [[REG1:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: fcmp olt half [[REG1]], 0xH3C00
+
+ // CHECK: [[REG2:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: fcmp olt half [[REG2]], 0xH4000
+
+ // CHECK: [[REG3:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: fcmp ogt half [[REG3]], 0xH3C00
+
+ // CHECK: [[REG4:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: fcmp ogt half [[REG4]], 0xH4200
+
+ // CHECK: [[REG5:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: fcmp oeq half [[REG5]], 0xH3C00
+
+ // CHECK: [[REG7:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: fcmp oeq half [[REG7]], 0xH4400
+
+ // CHECK: [[REG8:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: fcmp une half [[REG8]], 0xH3C00
+
+ // CHECK: [[REG9:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: fcmp une half [[REG9]], 0xH4500
+
+ // CHECK: [[REG10:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: fcmp ole half [[REG10]], 0xH3C00
+
+ // CHECK: [[REG11:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: fcmp ole half [[REG11]], 0xH4600
+
+ // CHECK: [[REG12:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: fcmp oge half [[REG12]], 0xH3C00
+
+ // CHECK: [[REG13:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: fcmp oge half [[REG13]], 0xH4700
+ return a < b || a < 2.0f16 || a > b || a > 3.0f16 || a == b || a == 4.0f16 ||
+ a != b || a != 5.0f16 || a <= b || a <= 6.0f16 || a >= b ||
+ a >= 7.0f16;
+}
+
+bool fcmp() {
+ _Float16 a = 0.0f16;
+ _Float16 b = 1.0f16;
+
+ // CHECK-NOT: llvm.convert.to.fp16
+ // CHECK-NOT: llvm.convert.from.fp16
+ // CHECK: [[REG1:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: [[REG2:%.*]] = load half, half* %b, align 2
+ // CHECK-NEXT: fcmp olt half [[REG1]], [[REG2]]
+
+ // CHECK: [[REG3:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: [[REG4:%.*]] = load half, half* %b, align 2
+ // CHECK-NEXT: fcmp ogt half [[REG3]], [[REG4]]
+
+ // CHECK: [[REG5:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: [[REG6:%.*]] = load half, half* %b, align 2
+ // CHECK-NEXT: fcmp oeq half [[REG5]], [[REG6]]
+
+ // CHECK: [[REG7:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: [[REG8:%.*]] = load half, half* %b, align 2
+ // CHECK-NEXT: fcmp une half [[REG7]], [[REG8]]
+
+ // CHECK: [[REG7:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: [[REG8:%.*]] = load half, half* %b, align 2
+ // CHECK-NEXT: fcmp ole half [[REG7]], [[REG8]]
+
+ // CHECK: [[REG7:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: [[REG8:%.*]] = load half, half* %b, align 2
+ // CHECK-NEXT: fcmp oge half [[REG7]], [[REG8]]
+ return a < b || a > b || a == b || a != b || a <= b || a >= b;
+}
+
+_Float16 fadd() {
+ _Float16 a = 1.0f16;
+ const _Float16 b = 2.0f16;
+
+ // CHECK-NOT: llvm.convert.to.fp16
+ // CHECK-NOT: llvm.convert.from.fp16
+
+ // CHECK: [[REG1:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: [[REG2:%.*]] = fadd half [[REG1]], 0xH4000
+ // CHECK-NEXT: [[REG3:%.*]] = fadd half [[REG2]], 0xH4200
+ // CHECK-NEXT: ret half [[REG3]]
+ return a + b + 3.0f16;
+}
+
+_Float16 fsub() {
+ _Float16 a = 1.0f16;
+ const _Float16 b = 2.0f16;
+
+ // CHECK-NOT: llvm.convert.to.fp16
+ // CHECK-NOT: llvm.convert.from.fp16
+
+ // CHECK: [[REG1:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: [[REG2:%.*]] = fsub half [[REG1]], 0xH4000
+ // CHECK-NEXT: [[REG3:%.*]] = fsub half [[REG2]], 0xH4200
+ // CHECK-NEXT: ret half [[REG3]]
+ return a - b - 3.0f16;
+}
+
+// CHECK: define spir_func half @_Z4fmulDF16_(half %arg)
+_Float16 fmul(_Float16 arg) {
+ _Float16 a = 1.0f16;
+ const _Float16 b = 2.0f16;
+
+ // CHECK-NOT: llvm.convert.to.fp16
+ // CHECK-NOT: llvm.convert.from.fp16
+
+ // CHECK: [[REG1:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: [[REG2:%.*]] = load half, half* %arg.addr, align 2
+ // CHECK-NEXT: [[REG3:%.*]] = fmul half [[REG1]], [[REG2]]
+ // CHECK-NEXT: [[REG4:%.*]] = fmul half [[REG3]], 0xH4000
+ // CHECK-NEXT: [[REG5:%.*]] = fmul half [[REG4]], 0xH4200
+ // CHECK-NEXT: ret half [[REG5]]
+ return a * arg * b * 3.0f16;
+}
+
+_Float16 fdiv() {
+ _Float16 a = 1.0f16;
+ const _Float16 b = 2.0f16;
+
+ // CHECK-NOT: llvm.convert.to.fp16
+ // CHECK-NOT: llvm.convert.from.fp16
+
+ // CHECK: [[REG1:%.*]] = load half, half* %a, align 2
+ // CHECK-NEXT: [[REG2:%.*]] = fdiv half [[REG1]], 0xH4000
+ // CHECK-NEXT: [[REG3:%.*]] = fdiv half [[REG2]], 0xH4200
+ // CHECK-NEXT: ret half [[REG3]]
+ return a / b / 3.0f16;
+}
diff --git a/test/CodeGen/split-debug-filename.c b/test/CodeGen/split-debug-filename.c
index 9ca7b0f3287a..99f89a741450 100644
--- a/test/CodeGen/split-debug-filename.c
+++ b/test/CodeGen/split-debug-filename.c
@@ -1,5 +1,9 @@
+// REQUIRES: x86-registered-target
// RUN: %clang_cc1 -debug-info-kind=limited -split-dwarf-file foo.dwo -S -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -debug-info-kind=limited -enable-split-dwarf -split-dwarf-file foo.dwo -S -emit-llvm -o - %s | FileCheck --check-prefix=VANILLA %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -debug-info-kind=limited -enable-split-dwarf -split-dwarf-file %t.dwo -emit-obj -o - %s | llvm-objdump -section-headers - | FileCheck --check-prefix=O %s
+// RUN: llvm-objdump -section-headers %t.dwo | FileCheck --check-prefix=DWO %s
+
int main (void) {
return 0;
}
@@ -10,3 +14,6 @@ int main (void) {
// Testing to ensure that the dwo name is not output into the compile unit if
// it's for vanilla split-dwarf rather than split-dwarf for implicit modules.
// VANILLA-NOT: splitDebugFilename
+
+// O-NOT: .dwo
+// DWO: .dwo
diff --git a/test/CodeGen/split-stacks.c b/test/CodeGen/split-stacks.c
index bf4cf0f026aa..d9ff1f955d2e 100644
--- a/test/CodeGen/split-stacks.c
+++ b/test/CodeGen/split-stacks.c
@@ -14,13 +14,13 @@ int main() {
return foo();
}
-// CHECK-SEGSTK: define i32 @foo() [[SS:#[0-9]+]] {
-// CHECK-SEGSTK: define i32 @nosplit() [[NSS:#[0-9]+]] {
-// CHECK-SEGSTK: define i32 @main() [[SS]] {
+// CHECK-SEGSTK: define dso_local i32 @foo() [[SS:#[0-9]+]] {
+// CHECK-SEGSTK: define dso_local i32 @nosplit() [[NSS:#[0-9]+]] {
+// CHECK-SEGSTK: define dso_local i32 @main() [[SS]] {
// CHECK-SEGSTK-NOT: [[NSS]] = { {{.*}} "split-stack" {{.*}} }
// CHECK-SEGSTK: [[SS]] = { {{.*}} "split-stack" {{.*}} }
// CHECK-SEGSTK-NOT: [[NSS]] = { {{.*}} "split-stack" {{.*}} }
-// CHECK-NOSEGSTK: define i32 @foo() #0 {
-// CHECK-NOSEGSTK: define i32 @main() #0 {
+// CHECK-NOSEGSTK: define dso_local i32 @foo() #0 {
+// CHECK-NOSEGSTK: define dso_local i32 @main() #0 {
// CHECK-NOSEGSTK-NOT: #0 = { {{.*}} "split-stack" {{.*}} }
diff --git a/test/CodeGen/sse-builtins.c b/test/CodeGen/sse-builtins.c
index 28b4f2cae1f0..e9801487be0d 100644
--- a/test/CodeGen/sse-builtins.c
+++ b/test/CodeGen/sse-builtins.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -450,7 +450,8 @@ __m128 test_mm_min_ss(__m128 A, __m128 B) {
__m128 test_mm_move_ss(__m128 A, __m128 B) {
// CHECK-LABEL: test_mm_move_ss
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
return _mm_move_ss(A, B);
}
@@ -508,14 +509,6 @@ __m128 test_mm_rcp_ps(__m128 x) {
__m128 test_mm_rcp_ss(__m128 x) {
// CHECK-LABEL: test_mm_rcp_ss
// CHECK: call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> {{.*}})
- // CHECK: extractelement <4 x float> {{.*}}, i32 0
- // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
- // CHECK: extractelement <4 x float> {{.*}}, i32 1
- // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
- // CHECK: extractelement <4 x float> {{.*}}, i32 2
- // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
- // CHECK: extractelement <4 x float> {{.*}}, i32 3
- // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
return _mm_rcp_ss(x);
}
@@ -528,14 +521,6 @@ __m128 test_mm_rsqrt_ps(__m128 x) {
__m128 test_mm_rsqrt_ss(__m128 x) {
// CHECK-LABEL: test_mm_rsqrt_ss
// CHECK: call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> {{.*}})
- // CHECK: extractelement <4 x float> {{.*}}, i32 0
- // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
- // CHECK: extractelement <4 x float> {{.*}}, i32 1
- // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
- // CHECK: extractelement <4 x float> {{.*}}, i32 2
- // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
- // CHECK: extractelement <4 x float> {{.*}}, i32 3
- // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
return _mm_rsqrt_ss(x);
}
@@ -655,21 +640,15 @@ __m128 test_mm_shuffle_ps(__m128 A, __m128 B) {
__m128 test_mm_sqrt_ps(__m128 x) {
// CHECK-LABEL: test_mm_sqrt_ps
- // CHECK: call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> {{.*}})
+ // CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> {{.*}})
return _mm_sqrt_ps(x);
}
__m128 test_sqrt_ss(__m128 x) {
// CHECK: define {{.*}} @test_sqrt_ss
- // CHECK: call <4 x float> @llvm.x86.sse.sqrt.ss
- // CHECK: extractelement <4 x float> {{.*}}, i32 0
- // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
- // CHECK: extractelement <4 x float> {{.*}}, i32 1
- // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
- // CHECK: extractelement <4 x float> {{.*}}, i32 2
- // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
- // CHECK: extractelement <4 x float> {{.*}}, i32 3
- // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
+ // CHECK: extractelement <4 x float> {{.*}}, i64 0
+ // CHECK: call float @llvm.sqrt.f32(float {{.*}})
+ // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i64 0
return _mm_sqrt_ss(x);
}
diff --git a/test/CodeGen/sse2-builtins.c b/test/CodeGen/sse2-builtins.c
index c2279cb88109..fe7f7ccf83ee 100644
--- a/test/CodeGen/sse2-builtins.c
+++ b/test/CodeGen/sse2-builtins.c
@@ -2,7 +2,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -121,13 +121,13 @@ __m128i test_mm_avg_epu16(__m128i A, __m128i B) {
__m128i test_mm_bslli_si128(__m128i A) {
// CHECK-LABEL: test_mm_bslli_si128
- // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+ // CHECK: shufflevector <16 x i8> zeroinitializer, <16 x i8> %{{.*}}, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
return _mm_bslli_si128(A, 5);
}
__m128i test_mm_bsrli_si128(__m128i A) {
// CHECK-LABEL: test_mm_bsrli_si128
- // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+ // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
return _mm_bsrli_si128(A, 5);
}
@@ -468,7 +468,7 @@ __m128d test_mm_cvtepi32_pd(__m128i A) {
__m128 test_mm_cvtepi32_ps(__m128i A) {
// CHECK-LABEL: test_mm_cvtepi32_ps
- // CHECK: call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %{{.*}})
+ // CHECK: sitofp <4 x i32> %{{.*}} to <4 x float>
return _mm_cvtepi32_ps(A);
}
@@ -613,17 +613,15 @@ __m128d test_mm_div_sd(__m128d A, __m128d B) {
// Lowering to pextrw requires optimization.
int test_mm_extract_epi16(__m128i A) {
// CHECK-LABEL: test_mm_extract_epi16
- // CHECK: [[x:%.*]] = and i32 %{{.*}}, 7
- // CHECK: extractelement <8 x i16> %{{.*}}, i32 [[x]]
+ // CHECK: extractelement <8 x i16> %{{.*}}, {{i32|i64}} 1
// CHECK: zext i16 %{{.*}} to i32
- return _mm_extract_epi16(A, 9);
+ return _mm_extract_epi16(A, 1);
}
__m128i test_mm_insert_epi16(__m128i A, int B) {
// CHECK-LABEL: test_mm_insert_epi16
- // CHECK: [[x:%.*]] = and i32 %{{.*}}, 7
- // CHECK: insertelement <8 x i16> %{{.*}}, i32 [[x]]
- return _mm_insert_epi16(A, B, 8);
+ // CHECK: insertelement <8 x i16> %{{.*}}, {{i32|i64}} 0
+ return _mm_insert_epi16(A, B, 0);
}
void test_mm_lfence() {
@@ -796,9 +794,7 @@ __m128i test_mm_move_epi64(__m128i A) {
__m128d test_mm_move_sd(__m128d A, __m128d B) {
// CHECK-LABEL: test_mm_move_sd
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
- // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
- // CHECK: extractelement <2 x double> %{{.*}}, i32 1
- // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
+ // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
return _mm_move_sd(A, B);
}
@@ -816,7 +812,9 @@ int test_mm_movemask_pd(__m128d A) {
__m128i test_mm_mul_epu32(__m128i A, __m128i B) {
// CHECK-LABEL: test_mm_mul_epu32
- // CHECK: call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: and <2 x i64> %{{.*}}, <i64 4294967295, i64 4294967295>
+ // CHECK: and <2 x i64> %{{.*}}, <i64 4294967295, i64 4294967295>
+ // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
return _mm_mul_epu32(A, B);
}
@@ -1116,7 +1114,7 @@ __m128i test_mm_setzero_si128() {
__m128i test_mm_shuffle_epi32(__m128i A) {
// CHECK-LABEL: test_mm_shuffle_epi32
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> zeroinitializer
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> zeroinitializer
return _mm_shuffle_epi32(A, 0);
}
@@ -1128,13 +1126,13 @@ __m128d test_mm_shuffle_pd(__m128d A, __m128d B) {
__m128i test_mm_shufflehi_epi16(__m128i A) {
// CHECK-LABEL: test_mm_shufflehi_epi16
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
return _mm_shufflehi_epi16(A, 0);
}
__m128i test_mm_shufflelo_epi16(__m128i A) {
// CHECK-LABEL: test_mm_shufflelo_epi16
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
return _mm_shufflelo_epi16(A, 0);
}
@@ -1176,29 +1174,27 @@ __m128i test_mm_slli_epi64(__m128i A) {
__m128i test_mm_slli_si128(__m128i A) {
// CHECK-LABEL: test_mm_slli_si128
- // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+ // CHECK: shufflevector <16 x i8> zeroinitializer, <16 x i8> %{{.*}}, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
return _mm_slli_si128(A, 5);
}
__m128i test_mm_slli_si128_2(__m128i A) {
// CHECK-LABEL: test_mm_slli_si128_2
- // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ // CHECK: ret <2 x i64> zeroinitializer
return _mm_slli_si128(A, 17);
}
__m128d test_mm_sqrt_pd(__m128d A) {
// CHECK-LABEL: test_mm_sqrt_pd
- // CHECK: call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %{{.*}})
+ // CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{.*}})
return _mm_sqrt_pd(A);
}
__m128d test_mm_sqrt_sd(__m128d A, __m128d B) {
// CHECK-LABEL: test_mm_sqrt_sd
- // CHECK: call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %{{.*}})
- // CHECK: extractelement <2 x double> %{{.*}}, i32 0
- // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
- // CHECK: extractelement <2 x double> %{{.*}}, i32 1
- // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
+ // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: call double @llvm.sqrt.f64(double {{.*}})
+ // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
return _mm_sqrt_sd(A, B);
}
@@ -1264,13 +1260,13 @@ __m128i test_mm_srli_epi64(__m128i A) {
__m128i test_mm_srli_si128(__m128i A) {
// CHECK-LABEL: test_mm_srli_si128
- // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+ // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
return _mm_srli_si128(A, 5);
}
__m128i test_mm_srli_si128_2(__m128i A) {
// CHECK-LABEL: test_mm_srli_si128_2
- // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ // CHECK: ret <2 x i64> zeroinitializer
return _mm_srli_si128(A, 17);
}
diff --git a/test/CodeGen/sse3-builtins.c b/test/CodeGen/sse3-builtins.c
index 46a7bbbb91e2..cf3e9526c377 100644
--- a/test/CodeGen/sse3-builtins.c
+++ b/test/CodeGen/sse3-builtins.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse3 -emit-llvm -o - -Wall -Werror | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll
diff --git a/test/CodeGen/sse41-builtins.c b/test/CodeGen/sse41-builtins.c
index b48b73ec18d9..2d9da07330dc 100644
--- a/test/CodeGen/sse41-builtins.c
+++ b/test/CodeGen/sse41-builtins.c
@@ -2,7 +2,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
@@ -171,26 +171,26 @@ __m128 test_mm_dp_ps(__m128 x, __m128 y) {
int test_mm_extract_epi8(__m128i x) {
// CHECK-LABEL: test_mm_extract_epi8
- // CHECK: extractelement <16 x i8> %{{.*}}, i32 1
+ // CHECK: extractelement <16 x i8> %{{.*}}, {{i32|i64}} 1
// CHECK: zext i8 %{{.*}} to i32
return _mm_extract_epi8(x, 1);
}
int test_mm_extract_epi32(__m128i x) {
// CHECK-LABEL: test_mm_extract_epi32
- // CHECK: extractelement <4 x i32> %{{.*}}, i32 1
+ // CHECK: extractelement <4 x i32> %{{.*}}, {{i32|i64}} 1
return _mm_extract_epi32(x, 1);
}
long long test_mm_extract_epi64(__m128i x) {
// CHECK-LABEL: test_mm_extract_epi64
- // CHECK: extractelement <2 x i64> %{{.*}}, i32 1
+ // CHECK: extractelement <2 x i64> %{{.*}}, {{i32|i64}} 1
return _mm_extract_epi64(x, 1);
}
int test_mm_extract_ps(__m128 x) {
// CHECK-LABEL: test_mm_extract_ps
- // CHECK: extractelement <4 x float> %{{.*}}, i32 1
+ // CHECK: extractelement <4 x float> %{{.*}}, {{i32|i64}} 1
return _mm_extract_ps(x, 1);
}
@@ -220,20 +220,20 @@ __m128 test_mm_floor_ss(__m128 x, __m128 y) {
__m128i test_mm_insert_epi8(__m128i x, char b) {
// CHECK-LABEL: test_mm_insert_epi8
- // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 0
- return _mm_insert_epi8(x, b, 16);
+ // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, {{i32|i64}} 1
+ return _mm_insert_epi8(x, b, 1);
}
__m128i test_mm_insert_epi32(__m128i x, int b) {
// CHECK-LABEL: test_mm_insert_epi32
- // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 0
- return _mm_insert_epi32(x, b, 4);
+ // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, {{i32|i64}} 1
+ return _mm_insert_epi32(x, b, 1);
}
__m128i test_mm_insert_epi64(__m128i x, long long b) {
// CHECK-LABEL: test_mm_insert_epi64
- // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 0
- return _mm_insert_epi64(x, b, 2);
+ // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, {{i32|i64}} 1
+ return _mm_insert_epi64(x, b, 1);
}
__m128 test_mm_insert_ps(__m128 x, __m128 y) {
@@ -312,7 +312,11 @@ __m128i test_mm_mpsadbw_epu8(__m128i x, __m128i y) {
__m128i test_mm_mul_epi32(__m128i x, __m128i y) {
// CHECK-LABEL: test_mm_mul_epi32
- // CHECK: call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ // CHECK: shl <2 x i64> %{{.*}}, <i64 32, i64 32>
+ // CHECK: ashr <2 x i64> %{{.*}}, <i64 32, i64 32>
+ // CHECK: shl <2 x i64> %{{.*}}, <i64 32, i64 32>
+ // CHECK: ashr <2 x i64> %{{.*}}, <i64 32, i64 32>
+ // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
return _mm_mul_epi32(x, y);
}
diff --git a/test/CodeGen/sse42-builtins.c b/test/CodeGen/sse42-builtins.c
index 523db1a3416e..d94d12452df7 100644
--- a/test/CodeGen/sse42-builtins.c
+++ b/test/CodeGen/sse42-builtins.c
@@ -2,7 +2,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
diff --git a/test/CodeGen/ssse3-builtins.c b/test/CodeGen/ssse3-builtins.c
index 4fd22aa79b46..4e6d401772be 100644
--- a/test/CodeGen/ssse3-builtins.c
+++ b/test/CodeGen/ssse3-builtins.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s
-#include <x86intrin.h>
+#include <immintrin.h>
// NOTE: This should match the tests in llvm/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll
diff --git a/test/CodeGen/stack-arg-probe.c b/test/CodeGen/stack-arg-probe.c
new file mode 100644
index 000000000000..d806db61c369
--- /dev/null
+++ b/test/CodeGen/stack-arg-probe.c
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 %s -triple=i686-windows-msvc -emit-llvm -o - -mno-stack-arg-probe | FileCheck %s -check-prefix=NO-STACKPROBE
+// RUN: %clang_cc1 %s -triple=i686-windows-msvc -emit-llvm -o - | FileCheck %s -check-prefix=STACKPROBE
+
+// NO-STACKPROBE: attributes #{{[0-9]+}} = {{{.*}} "no-stack-arg-probe"
+// STACKPROBE-NOT: attributes #{{[0-9]+}} = {{{.*}} "no-stack-arg-probe"
+
+void test1() {
+}
diff --git a/test/CodeGen/stack-protector.c b/test/CodeGen/stack-protector.c
index 7a45a2f4acf6..5037a3e0d6fe 100644
--- a/test/CodeGen/stack-protector.c
+++ b/test/CodeGen/stack-protector.c
@@ -22,6 +22,14 @@ void test1(const char *msg) {
printf("%s\n", a);
}
+// DEF: define {{.*}}void @test2(i8* %msg) #[[B:.*]] {
+__attribute__((no_stack_protector))
+void test2(const char *msg) {
+ char a[strlen(msg) + 1];
+ strcpy(a, msg);
+ printf("%s\n", a);
+}
+
// NOSSP-NOT: attributes #[[A]] = {{.*}} ssp
// SSP: attributes #[[A]] = {{.*}} ssp{{ }}
// SSPSTRONG: attributes #[[A]] = {{.*}} sspstrong
@@ -33,3 +41,15 @@ void test1(const char *msg) {
// SAFESTACK-SSP: attributes #[[A]] = {{.*}} safestack ssp{{ }}
// SAFESTACK-SSPSTRONG: attributes #[[A]] = {{.*}} safestack sspstrong
// SAFESTACK-SSPREQ: attributes #[[A]] = {{.*}} safestack sspreq
+
+// NOSSP-NOT: attributes #[[B]] = {{.*}} ssp
+// SSP-NOT: attributes #[[B]] = {{.*}} ssp{{ }}
+// SSPSTRONG-NOT: attributes #[[B]] = {{.*}} sspstrong
+// SSPREQ-NOT: attributes #[[B]] = {{.*}} sspreq
+
+// SAFESTACK-SSP: attributes #[[B]] = {{.*}} safestack
+// SAFESTACK-SSP-NOT: attributes #[[B]] = {{.*}} safestack ssp{{ }}
+// SAFESTACK-SSPSTRONG: attributes #[[B]] = {{.*}} safestack
+// SAFESTACK-SSPSTRONG-NOT: attributes #[[B]] = {{.*}} safestack sspstrong
+// SAFESTACK-SSPREQ: attributes #[[B]] = {{.*}} safestack
+// SAFESTACK-SSPREQ-NOT: attributes #[[B]] = {{.*}} safestack sspreq
diff --git a/test/CodeGen/stack-size-section.c b/test/CodeGen/stack-size-section.c
new file mode 100644
index 000000000000..504a42de1341
--- /dev/null
+++ b/test/CodeGen/stack-size-section.c
@@ -0,0 +1,9 @@
+// REQUIRES: x86-registered-target
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown %s -S -o - | FileCheck %s --check-prefix=CHECK-ABSENT
+// CHECK-ABSENT-NOT: section .stack_sizes
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fstack-size-section %s -S -o - | FileCheck %s --check-prefix=CHECK-PRESENT
+// CHECK-PRESENT: section .stack_sizes
+
+int foo() { return 42; }
diff --git a/test/CodeGen/string-literal-short-wstring.c b/test/CodeGen/string-literal-short-wstring.c
index fb1fe0cad0a8..8894b8823e91 100644
--- a/test/CodeGen/string-literal-short-wstring.c
+++ b/test/CodeGen/string-literal-short-wstring.c
@@ -12,12 +12,12 @@ int main() {
char b[10] = "\u1120\u0220\U00102030";
// ITANIUM: private unnamed_addr constant [3 x i16] [i16 65, i16 66, i16 0]
- // MSABI: linkonce_odr unnamed_addr constant [3 x i16] [i16 65, i16 66, i16 0]
+ // MSABI: linkonce_odr dso_local unnamed_addr constant [3 x i16] [i16 65, i16 66, i16 0]
const wchar_t *foo = L"AB";
// This should convert to utf16.
// ITANIUM: private unnamed_addr constant [5 x i16] [i16 4384, i16 544, i16 -9272, i16 -9168, i16 0]
- // MSABI: linkonce_odr unnamed_addr constant [5 x i16] [i16 4384, i16 544, i16 -9272, i16 -9168, i16 0]
+ // MSABI: linkonce_odr dso_local unnamed_addr constant [5 x i16] [i16 4384, i16 544, i16 -9272, i16 -9168, i16 0]
const wchar_t *bar = L"\u1120\u0220\U00102030";
diff --git a/test/CodeGen/target-data.c b/test/CodeGen/target-data.c
index 3869afec7858..2b990910920c 100644
--- a/test/CodeGen/target-data.c
+++ b/test/CodeGen/target-data.c
@@ -124,20 +124,20 @@
// RUN: %clang_cc1 -triple r600-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=R600
-// R600: target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+// R600: target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
// RUN: %clang_cc1 -triple r600-unknown -target-cpu cayman -o - -emit-llvm %s \
// RUN: | FileCheck %s -check-prefix=R600D
-// R600D: target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+// R600D: target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
// RUN: %clang_cc1 -triple amdgcn-unknown -target-cpu hawaii -o - -emit-llvm %s \
// RUN: | FileCheck %s -check-prefix=R600SI
-// R600SI: target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+// R600SI: target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
// Test default -target-cpu
// RUN: %clang_cc1 -triple amdgcn-unknown -o - -emit-llvm %s \
// RUN: | FileCheck %s -check-prefix=R600SIDefault
-// R600SIDefault: target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+// R600SIDefault: target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
// RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=AARCH64
diff --git a/test/CodeGen/target-features-error-2.c b/test/CodeGen/target-features-error-2.c
index 683d9ab99ef6..60586fb57f1c 100644
--- a/test/CodeGen/target-features-error-2.c
+++ b/test/CodeGen/target-features-error-2.c
@@ -1,38 +1,46 @@
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o - -D NEED_SSE42
// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o - -D NEED_AVX_1
// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o - -D NEED_AVX_2
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o - -D NEED_AVX_3
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o - -D NEED_AVX_4
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o - -D NEED_AVX512f
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +movdir64b -S -verify -o - -D NEED_MOVDIRI
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512vnni -target-feature +movdiri -S -verify -o - -D NEED_CLWB
#define __MM_MALLOC_H
#include <x86intrin.h>
-#if NEED_SSE42
+#if NEED_AVX_1
int baz(__m256i a) {
- return _mm256_extract_epi32(a, 3); // expected-error {{always_inline function '_mm256_extract_epi32' requires target feature 'sse4.2', but would be inlined into function 'baz' that is compiled without support for 'sse4.2'}}
+ return _mm256_extract_epi32(a, 3); // expected-error {{'__builtin_ia32_vec_ext_v8si' needs target feature avx}}
}
#endif
-#if NEED_AVX_1
+#if NEED_AVX_2
__m128 need_avx(__m128 a, __m128 b) {
return _mm_cmp_ps(a, b, 0); // expected-error {{'__builtin_ia32_cmpps' needs target feature avx}}
}
#endif
-#if NEED_AVX_2
-__m128 need_avx(__m128 a, __m128 b) {
- return _mm_cmp_ss(a, b, 0); // expected-error {{'__builtin_ia32_cmpss' needs target feature avx}}
+#if NEED_AVX512f
+unsigned short need_avx512f(unsigned short a, unsigned short b) {
+ return __builtin_ia32_korhi(a, b); // expected-error {{'__builtin_ia32_korhi' needs target feature avx512f}}
}
#endif
-#if NEED_AVX_3
-__m128d need_avx(__m128d a, __m128d b) {
- return _mm_cmp_pd(a, b, 0); // expected-error {{'__builtin_ia32_cmppd' needs target feature avx}}
+#if NEED_MOVDIRI
+void need_movdiri(unsigned int *a, unsigned int b) {
+ __builtin_ia32_directstore_u32(a, b); // expected-error {{'__builtin_ia32_directstore_u32' needs target feature movdiri}}
}
#endif
-#if NEED_AVX_4
-__m128d need_avx(__m128d a, __m128d b) {
- return _mm_cmp_sd(a, b, 0); // expected-error {{'__builtin_ia32_cmpsd' needs target feature avx}}
+#if NEED_CLWB
+static __inline__ void
+ __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni,clwb,movdiri,movdir64b")))
+ func(unsigned int *a, unsigned int b)
+{
+ __builtin_ia32_directstore_u32(a, b);
+}
+
+void need_clwb(unsigned int *a, unsigned int b) {
+ func(a, b); // expected-error {{always_inline function 'func' requires target feature 'clwb', but would be inlined into function 'need_clwb' that is compiled without support for 'clwb'}}
+
}
#endif
diff --git a/test/CodeGen/target-features-error.c b/test/CodeGen/target-features-error.c
index 518f6e6189ed..a55163a95d43 100644
--- a/test/CodeGen/target-features-error.c
+++ b/test/CodeGen/target-features-error.c
@@ -3,6 +3,5 @@ int __attribute__((target("avx"), always_inline)) foo(int a) {
return a + 4;
}
int bar() {
- return foo(4); // expected-error {{always_inline function 'foo' requires target feature 'sse4.2', but would be inlined into function 'bar' that is compiled without support for 'sse4.2'}}
+ return foo(4); // expected-error {{always_inline function 'foo' requires target feature 'avx', but would be inlined into function 'bar' that is compiled without support for 'avx'}}
}
-
diff --git a/test/CodeGen/tbaa-base.cpp b/test/CodeGen/tbaa-base.cpp
new file mode 100644
index 000000000000..bcd4732df8f8
--- /dev/null
+++ b/test/CodeGen/tbaa-base.cpp
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -O1 %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK,OLD-PATH
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -O1 %s -emit-llvm -new-struct-path-tbaa -o - | FileCheck %s -check-prefixes=CHECK,NEW-PATH
+//
+// Test generation of TBAA metadata for accesses to members of base classes.
+
+struct A {
+ int x, y, z;
+};
+
+struct B : A {
+ int i;
+};
+
+struct C {
+ int i;
+ B b;
+ int j;
+};
+
+int f1(B *b) {
+// CHECK-LABEL: _Z2f1P1B
+// CHECK: load i32, {{.*}}, !tbaa [[TAG_A_y:!.*]]
+ return b->y;
+}
+
+int f2(C *c) {
+// CHECK-LABEL: _Z2f2P1C
+// CHECK: load i32, {{.*}}, !tbaa [[TAG_A_y]]
+ return (&(c->b))->y;
+}
+
+struct D : virtual A
+{};
+
+struct E {
+ D d;
+};
+
+int f3(D *d) {
+// CHECK-LABEL: _Z2f3P1D
+// CHECK: load i32, {{.*}}, !tbaa [[TAG_A_y]]
+ return d->y;
+}
+
+int f4(E *e) {
+// CHECK-LABEL: _Z2f4P1E
+// CHECK: load i32, {{.*}}, !tbaa [[TAG_A_y]]
+ return (&(e->d))->y;
+}
+
+// OLD-PATH-DAG: [[TYPE_char:!.*]] = !{!"omnipotent char", {{.*}}, i64 0}
+// OLD-PATH-DAG: [[TYPE_int:!.*]] = !{!"int", [[TYPE_char]], i64 0}
+// OLD-PATH-DAG: [[TYPE_A:!.*]] = !{!"_ZTS1A", [[TYPE_int]], i64 0, [[TYPE_int]], i64 4, [[TYPE_int]], i64 8}
+// OLD-PATH-DAG: [[TAG_A_y]] = !{[[TYPE_A]], [[TYPE_int]], i64 4}
+// NEW-PATH-DAG: [[TYPE_char:!.*]] = !{{{.*}}, i64 1, !"omnipotent char"}
+// NEW-PATH-DAG: [[TYPE_int:!.*]] = !{[[TYPE_char]], i64 4, !"int"}
+// NEW-PATH-DAG: [[TYPE_A:!.*]] = !{[[TYPE_char]], i64 12, !"_ZTS1A", [[TYPE_int]], i64 0, i64 4, [[TYPE_int]], i64 4, i64 4, [[TYPE_int]], i64 8, i64 4}
+// NEW-PATH-DAG: [[TAG_A_y]] = !{[[TYPE_A]], [[TYPE_int]], i64 4, i64 4}
diff --git a/test/CodeGen/tbaa-cast.cpp b/test/CodeGen/tbaa-cast.cpp
index 2b9e31086664..fee2f30bccc1 100644
--- a/test/CodeGen/tbaa-cast.cpp
+++ b/test/CodeGen/tbaa-cast.cpp
@@ -1,5 +1,7 @@
// RUN: %clang_cc1 -triple x86_64-linux -O1 -disable-llvm-passes %s \
-// RUN: -emit-llvm -o - | FileCheck %s
+// RUN: -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK,OLD-PATH
+// RUN: %clang_cc1 -triple x86_64-linux -O1 -disable-llvm-passes %s \
+// RUN: -emit-llvm -new-struct-path-tbaa -o - | FileCheck %s -check-prefixes=CHECK,NEW-PATH
//
// Check that we generate correct TBAA information for lvalues constructed
// with use of casts.
@@ -18,6 +20,9 @@ void foo(S *p) {
((V*)p->bytes)->n = 5;
}
-// CHECK-DAG: [[TAG_V_n]] = !{[[TYPE_V:!.*]], [[TYPE_int:!.*]], i64 0}
-// CHECK-DAG: [[TYPE_V]] = !{!"_ZTS1V", !{{.*}}, i64 0}
-// CHECK-DAG: [[TYPE_int]] = !{!"int", !{{.*}}, i64 0}
+// OLD-PATH-DAG: [[TAG_V_n]] = !{[[TYPE_V:!.*]], [[TYPE_int:!.*]], i64 0}
+// OLD-PATH-DAG: [[TYPE_V]] = !{!"_ZTS1V", !{{.*}}, i64 0}
+// OLD-PATH-DAG: [[TYPE_int]] = !{!"int", !{{.*}}, i64 0}
+// NEW-PATH-DAG: [[TAG_V_n]] = !{[[TYPE_V:!.*]], [[TYPE_int:!.*]], i64 0, i64 4}
+// NEW-PATH-DAG: [[TYPE_V]] = !{[[TYPE_char:!.*]], i64 4, !"_ZTS1V", [[TYPE_int]], i64 0, i64 4}
+// NEW-PATH-DAG: [[TYPE_int]] = !{[[TYPE_char]], i64 4, !"int"}
diff --git a/test/CodeGen/tbaa-class.cpp b/test/CodeGen/tbaa-class.cpp
index 9b614a19e8b0..b7d2a865451a 100644
--- a/test/CodeGen/tbaa-class.cpp
+++ b/test/CodeGen/tbaa-class.cpp
@@ -1,5 +1,6 @@
// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=PATH
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefixes=PATH,OLD-PATH
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s -emit-llvm -new-struct-path-tbaa -o - | FileCheck %s -check-prefixes=PATH,NEW-PATH
// Test TBAA metadata generated by front-end.
typedef unsigned char uint8_t;
@@ -205,24 +206,46 @@ uint32_t g12(StructC *C, StructD *D, uint64_t count) {
// CHECK: [[TAG_i16]] = !{[[TYPE_i16:!.*]], [[TYPE_i16]], i64 0}
// CHECK: [[TYPE_i16]] = !{!"short", [[TYPE_char]],
-// PATH: [[TYPE_CHAR:!.*]] = !{!"omnipotent char", !
-// PATH: [[TAG_i32]] = !{[[TYPE_INT:!.*]], [[TYPE_INT]], i64 0}
-// PATH: [[TYPE_INT]] = !{!"int", [[TYPE_CHAR]]
-// PATH: [[TAG_A_f32]] = !{[[TYPE_A:!.*]], [[TYPE_INT]], i64 4}
-// PATH: [[TYPE_A]] = !{!"_ZTS7StructA", [[TYPE_SHORT:!.*]], i64 0, [[TYPE_INT]], i64 4, [[TYPE_SHORT]], i64 8, [[TYPE_INT]], i64 12}
-// PATH: [[TYPE_SHORT:!.*]] = !{!"short", [[TYPE_CHAR]]
-// PATH: [[TAG_A_f16]] = !{[[TYPE_A]], [[TYPE_SHORT]], i64 0}
-// PATH: [[TAG_B_a_f32]] = !{[[TYPE_B:!.*]], [[TYPE_INT]], i64 8}
-// PATH: [[TYPE_B]] = !{!"_ZTS7StructB", [[TYPE_SHORT]], i64 0, [[TYPE_A]], i64 4, [[TYPE_INT]], i64 20}
-// PATH: [[TAG_B_a_f16]] = !{[[TYPE_B]], [[TYPE_SHORT]], i64 4}
-// PATH: [[TAG_B_f32]] = !{[[TYPE_B]], [[TYPE_INT]], i64 20}
-// PATH: [[TAG_B_a_f32_2]] = !{[[TYPE_B]], [[TYPE_INT]], i64 16}
-// PATH: [[TAG_S_f32]] = !{[[TYPE_S:!.*]], [[TYPE_INT]], i64 4}
-// PATH: [[TYPE_S]] = !{!"_ZTS7StructS", [[TYPE_SHORT]], i64 0, [[TYPE_INT]], i64 4}
-// PATH: [[TAG_S_f16]] = !{[[TYPE_S]], [[TYPE_SHORT]], i64 0}
-// PATH: [[TAG_S2_f32_2]] = !{[[TYPE_S2:!.*]], [[TYPE_INT]], i64 12}
-// PATH: [[TYPE_S2]] = !{!"_ZTS8StructS2", [[TYPE_SHORT]], i64 8, [[TYPE_INT]], i64 12}
-// PATH: [[TAG_C_b_a_f32]] = !{[[TYPE_C:!.*]], [[TYPE_INT]], i64 12}
-// PATH: [[TYPE_C]] = !{!"_ZTS7StructC", [[TYPE_SHORT]], i64 0, [[TYPE_B]], i64 4, [[TYPE_INT]], i64 28}
-// PATH: [[TAG_D_b_a_f32]] = !{[[TYPE_D:!.*]], [[TYPE_INT]], i64 12}
-// PATH: [[TYPE_D]] = !{!"_ZTS7StructD", [[TYPE_SHORT]], i64 0, [[TYPE_B]], i64 4, [[TYPE_INT]], i64 28, [[TYPE_CHAR]], i64 32}
+// OLD-PATH: [[TYPE_CHAR:!.*]] = !{!"omnipotent char", !
+// OLD-PATH: [[TAG_i32]] = !{[[TYPE_INT:!.*]], [[TYPE_INT]], i64 0}
+// OLD-PATH: [[TYPE_INT]] = !{!"int", [[TYPE_CHAR]]
+// OLD-PATH: [[TAG_A_f32]] = !{[[TYPE_A:!.*]], [[TYPE_INT]], i64 4}
+// OLD-PATH: [[TYPE_A]] = !{!"_ZTS7StructA", [[TYPE_SHORT:!.*]], i64 0, [[TYPE_INT]], i64 4, [[TYPE_SHORT]], i64 8, [[TYPE_INT]], i64 12}
+// OLD-PATH: [[TYPE_SHORT:!.*]] = !{!"short", [[TYPE_CHAR]]
+// OLD-PATH: [[TAG_A_f16]] = !{[[TYPE_A]], [[TYPE_SHORT]], i64 0}
+// OLD-PATH: [[TAG_B_a_f32]] = !{[[TYPE_B:!.*]], [[TYPE_INT]], i64 8}
+// OLD-PATH: [[TYPE_B]] = !{!"_ZTS7StructB", [[TYPE_SHORT]], i64 0, [[TYPE_A]], i64 4, [[TYPE_INT]], i64 20}
+// OLD-PATH: [[TAG_B_a_f16]] = !{[[TYPE_B]], [[TYPE_SHORT]], i64 4}
+// OLD-PATH: [[TAG_B_f32]] = !{[[TYPE_B]], [[TYPE_INT]], i64 20}
+// OLD-PATH: [[TAG_B_a_f32_2]] = !{[[TYPE_B]], [[TYPE_INT]], i64 16}
+// OLD-PATH: [[TAG_S_f32]] = !{[[TYPE_S:!.*]], [[TYPE_INT]], i64 4}
+// OLD-PATH: [[TYPE_S]] = !{!"_ZTS7StructS", [[TYPE_SHORT]], i64 0, [[TYPE_INT]], i64 4}
+// OLD-PATH: [[TAG_S_f16]] = !{[[TYPE_S]], [[TYPE_SHORT]], i64 0}
+// OLD-PATH: [[TAG_S2_f32_2]] = !{[[TYPE_S2:!.*]], [[TYPE_INT]], i64 12}
+// OLD-PATH: [[TYPE_S2]] = !{!"_ZTS8StructS2", [[TYPE_SHORT]], i64 8, [[TYPE_INT]], i64 12}
+// OLD-PATH: [[TAG_C_b_a_f32]] = !{[[TYPE_C:!.*]], [[TYPE_INT]], i64 12}
+// OLD-PATH: [[TYPE_C]] = !{!"_ZTS7StructC", [[TYPE_SHORT]], i64 0, [[TYPE_B]], i64 4, [[TYPE_INT]], i64 28}
+// OLD-PATH: [[TAG_D_b_a_f32]] = !{[[TYPE_D:!.*]], [[TYPE_INT]], i64 12}
+// OLD-PATH: [[TYPE_D]] = !{!"_ZTS7StructD", [[TYPE_SHORT]], i64 0, [[TYPE_B]], i64 4, [[TYPE_INT]], i64 28, [[TYPE_CHAR]], i64 32}
+
+// NEW-PATH: [[TYPE_CHAR:!.*]] = !{!{{.*}}, i64 1, !"omnipotent char"}
+// NEW-PATH: [[TAG_i32]] = !{[[TYPE_INT:!.*]], [[TYPE_INT]], i64 0, i64 4}
+// NEW-PATH: [[TYPE_INT]] = !{[[TYPE_CHAR]], i64 4, !"int"}
+// NEW-PATH: [[TAG_A_f32]] = !{[[TYPE_A:!.*]], [[TYPE_INT]], i64 4, i64 4}
+// NEW-PATH: [[TYPE_A]] = !{[[TYPE_CHAR]], i64 16, !"_ZTS7StructA", [[TYPE_SHORT:!.*]], i64 0, i64 2, [[TYPE_INT]], i64 4, i64 4, [[TYPE_SHORT]], i64 8, i64 2, [[TYPE_INT]], i64 12, i64 4}
+// NEW-PATH: [[TYPE_SHORT:!.*]] = !{[[TYPE_CHAR]], i64 2, !"short"}
+// NEW-PATH: [[TAG_A_f16]] = !{[[TYPE_A]], [[TYPE_SHORT]], i64 0, i64 2}
+// NEW-PATH: [[TAG_B_a_f32]] = !{[[TYPE_B:!.*]], [[TYPE_INT]], i64 8, i64 4}
+// NEW-PATH: [[TYPE_B]] = !{[[TYPE_CHAR]], i64 24, !"_ZTS7StructB", [[TYPE_SHORT]], i64 0, i64 2, [[TYPE_A]], i64 4, i64 16, [[TYPE_INT]], i64 20, i64 4}
+// NEW-PATH: [[TAG_B_a_f16]] = !{[[TYPE_B]], [[TYPE_SHORT]], i64 4, i64 2}
+// NEW-PATH: [[TAG_B_f32]] = !{[[TYPE_B]], [[TYPE_INT]], i64 20, i64 4}
+// NEW-PATH: [[TAG_B_a_f32_2]] = !{[[TYPE_B]], [[TYPE_INT]], i64 16, i64 4}
+// NEW-PATH: [[TAG_S_f32]] = !{[[TYPE_S:!.*]], [[TYPE_INT]], i64 4, i64 4}
+// NEW-PATH: [[TYPE_S]] = !{[[TYPE_CHAR]], i64 8, !"_ZTS7StructS", [[TYPE_SHORT]], i64 0, i64 2, [[TYPE_INT]], i64 4, i64 4}
+// NEW-PATH: [[TAG_S_f16]] = !{[[TYPE_S]], [[TYPE_SHORT]], i64 0, i64 2}
+// NEW-PATH: [[TAG_S2_f32_2]] = !{[[TYPE_S2:!.*]], [[TYPE_INT]], i64 12, i64 4}
+// NEW-PATH: [[TYPE_S2]] = !{[[TYPE_CHAR]], i64 16, !"_ZTS8StructS2", [[TYPE_SHORT]], i64 8, i64 2, [[TYPE_INT]], i64 12, i64 4}
+// NEW-PATH: [[TAG_C_b_a_f32]] = !{[[TYPE_C:!.*]], [[TYPE_INT]], i64 12, i64 4}
+// NEW-PATH: [[TYPE_C]] = !{[[TYPE_CHAR]], i64 32, !"_ZTS7StructC", [[TYPE_SHORT]], i64 0, i64 2, [[TYPE_B]], i64 4, i64 24, [[TYPE_INT]], i64 28, i64 4}
+// NEW-PATH: [[TAG_D_b_a_f32]] = !{[[TYPE_D:!.*]], [[TYPE_INT]], i64 12, i64 4}
+// NEW-PATH: [[TYPE_D]] = !{[[TYPE_CHAR]], i64 36, !"_ZTS7StructD", [[TYPE_SHORT]], i64 0, i64 2, [[TYPE_B]], i64 4, i64 24, [[TYPE_INT]], i64 28, i64 4, [[TYPE_CHAR]], i64 32, i64 1}
diff --git a/test/CodeGen/tbaa-for-vptr.cpp b/test/CodeGen/tbaa-for-vptr.cpp
index 6136874cbfcb..1139749f0fad 100644
--- a/test/CodeGen/tbaa-for-vptr.cpp
+++ b/test/CodeGen/tbaa-for-vptr.cpp
@@ -1,6 +1,10 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -o - -fsanitize=thread %s | FileCheck %s
-// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -o - -O1 %s | FileCheck %s
-// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -o - -O1 -relaxed-aliasing -fsanitize=thread %s | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -o - -fsanitize=thread %s | FileCheck %s --check-prefixes=CHECK,OLD-PATH
+// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -o - -O1 %s | FileCheck %s --check-prefixes=CHECK,OLD-PATH
+// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -o - -O1 -relaxed-aliasing -fsanitize=thread %s | FileCheck %s --check-prefixes=CHECK,OLD-PATH
+//
+// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -new-struct-path-tbaa -o - -fsanitize=thread %s | FileCheck %s --check-prefixes=CHECK,NEW-PATH
+// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -new-struct-path-tbaa -o - -O1 %s | FileCheck %s --check-prefixes=CHECK,NEW-PATH
+// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -new-struct-path-tbaa -o - -O1 -relaxed-aliasing -fsanitize=thread %s | FileCheck %s --check-prefixes=CHECK,NEW-PATH
//
// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -o - %s | FileCheck %s --check-prefix=NOTBAA
// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -o - -O2 -relaxed-aliasing %s | FileCheck %s --check-prefix=NOTBAA
@@ -30,6 +34,8 @@ void CallFoo(A *a, int (A::*fp)() const) {
// CHECK-LABEL: @_ZN1AC2Ev
// CHECK: store i32 (...)** {{.*}}, !tbaa ![[NUM]]
//
-// CHECK: [[NUM]] = !{[[TYPE:!.*]], [[TYPE]], i64 0}
-// CHECK: [[TYPE]] = !{!"vtable pointer", !{{.*}}
+// OLD-PATH: [[NUM]] = !{[[TYPE:!.*]], [[TYPE]], i64 0}
+// OLD-PATH: [[TYPE]] = !{!"vtable pointer", !{{.*}}
+// NEW-PATH: [[NUM]] = !{[[TYPE:!.*]], [[TYPE]], i64 0, i64 [[POINTER_SIZE:.*]]}
+// NEW-PATH: [[TYPE]] = !{!{{.*}}, i64 [[POINTER_SIZE]], !"vtable pointer"}
// NOTBAA-NOT: = !{!"Simple C++ TBAA"}
diff --git a/test/CodeGen/tbaa-ms-abi.cpp b/test/CodeGen/tbaa-ms-abi.cpp
index 51100b18c3d4..c7f258b50224 100644
--- a/test/CodeGen/tbaa-ms-abi.cpp
+++ b/test/CodeGen/tbaa-ms-abi.cpp
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -triple i686-pc-win32 -disable-llvm-passes -emit-llvm -o - -O1 %s | FileCheck %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -disable-llvm-passes -emit-llvm -o - -O1 %s | FileCheck %s -check-prefixes=CHECK,OLD-PATH
+// RUN: %clang_cc1 -triple i686-pc-win32 -disable-llvm-passes -emit-llvm -new-struct-path-tbaa -o - -O1 %s | FileCheck %s -check-prefixes=CHECK,NEW-PATH
//
// Test that TBAA works in the Microsoft C++ ABI. We used to error out while
// attempting to mangle RTTI.
@@ -16,7 +17,11 @@ StructB::StructB() {
// CHECK: store i32 42, i32* {{.*}}, !tbaa [[TAG_A_i32:!.*]]
}
-// CHECK: [[TYPE_INT:!.*]] = !{!"int", [[TYPE_CHAR:!.*]], i64 0}
-// CHECK: [[TYPE_CHAR]] = !{!"omnipotent char", !
-// CHECK: [[TAG_A_i32]] = !{[[TYPE_A:!.*]], [[TYPE_INT]], i64 0}
-// CHECK: [[TYPE_A]] = !{!"?AUStructA@@", [[TYPE_INT]], i64 0}
+// OLD-PATH: [[TYPE_INT:!.*]] = !{!"int", [[TYPE_CHAR:!.*]], i64 0}
+// OLD-PATH: [[TYPE_CHAR]] = !{!"omnipotent char", !
+// OLD-PATH: [[TAG_A_i32]] = !{[[TYPE_A:!.*]], [[TYPE_INT]], i64 0}
+// OLD-PATH: [[TYPE_A]] = !{!"?AUStructA@@", [[TYPE_INT]], i64 0}
+// NEW-PATH: [[TYPE_INT:!.*]] = !{[[TYPE_CHAR:!.*]], i64 4, !"int"}
+// NEW-PATH: [[TYPE_CHAR]] = !{{{.*}}, i64 1, !"omnipotent char"}
+// NEW-PATH: [[TAG_A_i32]] = !{[[TYPE_A:!.*]], [[TYPE_INT]], i64 0, i64 4}
+// NEW-PATH: [[TYPE_A]] = !{[[TYPE_CHAR]], i64 4, !"?AUStructA@@", [[TYPE_INT]], i64 0, i64 4}
diff --git a/test/CodeGen/tbaa-reference.cpp b/test/CodeGen/tbaa-reference.cpp
index ecdbfbee7f14..113796aa045a 100644
--- a/test/CodeGen/tbaa-reference.cpp
+++ b/test/CodeGen/tbaa-reference.cpp
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-linux -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK,OLD-PATH
+// RUN: %clang_cc1 -triple x86_64-linux -O1 -disable-llvm-passes %s -emit-llvm -new-struct-path-tbaa -o - | FileCheck %s -check-prefixes=CHECK,NEW-PATH
//
// Check that we generate correct TBAA information for reference accesses.
@@ -29,9 +30,16 @@ S &B::get() {
return s;
}
-// CHECK-DAG: [[TAG_pointer]] = !{[[TYPE_pointer:!.*]], [[TYPE_pointer]], i64 0}
-// CHECK-DAG: [[TAG_B_s]] = !{[[TYPE_B:!.*]], [[TYPE_pointer]], i64 0}
+// OLD-PATH-DAG: [[TAG_pointer]] = !{[[TYPE_pointer:!.*]], [[TYPE_pointer]], i64 0}
+// OLD-PATH-DAG: [[TAG_B_s]] = !{[[TYPE_B:!.*]], [[TYPE_pointer]], i64 0}
//
-// CHECK-DAG: [[TYPE_B]] = !{!"_ZTS1B", [[TYPE_pointer]], i64 0}
-// CHECK-DAG: [[TYPE_pointer]] = !{!"any pointer", [[TYPE_char:!.*]], i64 0}
-// CHECK-DAG: [[TYPE_char]] = !{!"omnipotent char", {{!.*}}, i64 0}
+// OLD-PATH-DAG: [[TYPE_B]] = !{!"_ZTS1B", [[TYPE_pointer]], i64 0}
+// OLD-PATH-DAG: [[TYPE_pointer]] = !{!"any pointer", [[TYPE_char:!.*]], i64 0}
+// OLD-PATH-DAG: [[TYPE_char]] = !{!"omnipotent char", {{!.*}}, i64 0}
+
+// NEW-PATH-DAG: [[TAG_pointer]] = !{[[TYPE_pointer:!.*]], [[TYPE_pointer]], i64 0, i64 8}
+// NEW-PATH-DAG: [[TAG_B_s]] = !{[[TYPE_B:!.*]], [[TYPE_pointer]], i64 0, i64 8}
+//
+// NEW-PATH-DAG: [[TYPE_B]] = !{[[TYPE_char:!.*]], i64 8, !"_ZTS1B", [[TYPE_pointer]], i64 0, i64 8}
+// NEW-PATH-DAG: [[TYPE_pointer]] = !{[[TYPE_char:!.*]], i64 8, !"any pointer"}
+// NEW-PATH-DAG: [[TYPE_char]] = !{{{!.*}}, i64 1, !"omnipotent char"}
diff --git a/test/CodeGen/tbaa-struct.cpp b/test/CodeGen/tbaa-struct.cpp
index 2623c42c9a3f..9e5429b2e360 100644
--- a/test/CodeGen/tbaa-struct.cpp
+++ b/test/CodeGen/tbaa-struct.cpp
@@ -1,6 +1,11 @@
-// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - -O1 %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - -O1 %s | \
+// RUN: FileCheck -check-prefixes=CHECK,CHECK-OLD %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -new-struct-path-tbaa \
+// RUN: -emit-llvm -o - -O1 %s | \
+// RUN: FileCheck -check-prefixes=CHECK,CHECK-NEW %s
//
-// Check that we generate !tbaa.struct metadata for struct copies.
+// Check that we generate TBAA metadata for struct copies correctly.
+
struct A {
short s;
int i;
@@ -8,68 +13,117 @@ struct A {
int j;
};
-void copy(struct A *a, struct A *b) {
- *a = *b;
-}
+typedef A __attribute__((may_alias)) AA;
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 16, i32 4, i1 false), !tbaa.struct [[TS:!.*]]
+void copy(A *a1, A *a2) {
+// CHECK-LABEL: _Z4copyP1AS0_
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i64 16, i1 false)
+// CHECK-OLD-SAME: !tbaa.struct [[TS:!.*]]
+// CHECK-NEW-SAME: !tbaa [[TAG_A:![0-9]*]]
+ *a1 = *a2;
+}
struct B {
- char c1;
- struct A a;
- int ii;
+ char c;
+ A a;
+ int i;
};
-void copy2(struct B *a, struct B *b) {
- *a = *b;
+void copy2(B *b1, B *b2) {
+// CHECK-LABEL: _Z5copy2P1BS0_
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i64 24, i1 false)
+// CHECK-OLD-SAME: !tbaa.struct [[TS2:!.*]]
+// CHECK-NEW-SAME: !tbaa [[TAG_B:![0-9]*]]
+ *b1 = *b2;
}
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 24, i32 4, i1 false), !tbaa.struct [[TS2:!.*]]
+struct S {
+ _Complex char cc;
+ _Complex int ci;
+};
-typedef _Complex int T2;
-typedef _Complex char T5;
-typedef _Complex int T7;
-typedef struct T4 { T5 field0; T7 field1; } T4;
-typedef union T1 { T2 field0; T4 field1; } T1;
+union U {
+ _Complex int ci;
+ S s;
+};
-void copy3 (T1 *a, T1 *b) {
- *a = *b;
+void copy3(U *u1, U *u2) {
+// CHECK-LABEL: _Z5copy3P1US0_
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i64 12, i1 false)
+// CHECK-OLD-SAME: !tbaa.struct [[TS3:!.*]]
+// CHECK-NEW-SAME: !tbaa [[TAG_U:![0-9]*]]
+ *u1 = *u2;
}
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 12, i32 4, i1 false), !tbaa.struct [[TS3:!.*]]
-
 // Make sure that a zero-length bitfield works.
-#define ATTR __attribute__ ((ms_struct))
-struct five {
+struct C {
char a;
- int :0; /* ignored; prior field is not a bitfield. */
+ int : 0; // Shall not be ignored; see r185018.
char b;
char c;
-} ATTR;
-void copy4(struct five *a, struct five *b) {
- *a = *b;
+} __attribute__((ms_struct));
+
+void copy4(C *c1, C *c2) {
+// CHECK-LABEL: _Z5copy4P1CS0_
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 3, i1 false)
+// CHECK-OLD-SAME: !tbaa.struct [[TS4:!.*]]
+// CHECK-NEW-SAME: !tbaa [[TAG_C:![0-9]*]]
+ *c1 = *c2;
}
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 3, i32 1, i1 false), !tbaa.struct [[TS4:!.*]]
-struct six {
+struct D {
char a;
- int :0;
+ int : 0;
char b;
char c;
};
-void copy5(struct six *a, struct six *b) {
- *a = *b;
+
+void copy5(D *d1, D *d2) {
+// CHECK-LABEL: _Z5copy5P1DS0_
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 6, i1 false)
+// CHECK-OLD-SAME: !tbaa.struct [[TS5:!.*]]
+// CHECK-NEW-SAME: !tbaa [[TAG_D:![0-9]*]]
+ *d1 = *d2;
+}
+
+void copy6(AA *a1, A *a2) {
+// CHECK-LABEL: _Z5copy6P1AS0_
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i64 16, i1 false)
+// CHECK-OLD-SAME: !tbaa.struct [[TS]]
+// CHECK-NEW-SAME: !tbaa [[TAG_char:![0-9]*]]
+ *a1 = *a2;
}
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 6, i32 1, i1 false), !tbaa.struct [[TS5:!.*]]
-// CHECK: [[TS]] = !{i64 0, i64 2, !{{.*}}, i64 4, i64 4, !{{.*}}, i64 8, i64 1, !{{.*}}, i64 12, i64 4, !{{.*}}}
-// CHECK: [[CHAR:!.*]] = !{!"omnipotent char", !{{.*}}}
-// CHECK: [[TAG_INT:!.*]] = !{[[INT:!.*]], [[INT]], i64 0}
-// CHECK: [[INT]] = !{!"int", [[CHAR]]
-// CHECK: [[TAG_CHAR:!.*]] = !{[[CHAR]], [[CHAR]], i64 0}
+void copy7(A *a1, AA *a2) {
+// CHECK-LABEL: _Z5copy7P1AS0_
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i64 16, i1 false)
+// CHECK-OLD-SAME: !tbaa.struct [[TS]]
+// CHECK-NEW-SAME: !tbaa [[TAG_char]]
+ *a1 = *a2;
+}
+
+// CHECK-OLD: [[TS]] = !{i64 0, i64 2, !{{.*}}, i64 4, i64 4, !{{.*}}, i64 8, i64 1, !{{.*}}, i64 12, i64 4, !{{.*}}}
+// CHECK-OLD: [[CHAR:!.*]] = !{!"omnipotent char", !{{.*}}}
+// CHECK-OLD: [[TAG_INT:!.*]] = !{[[INT:!.*]], [[INT]], i64 0}
+// CHECK-OLD: [[INT]] = !{!"int", [[CHAR]]
+// CHECK-OLD: [[TAG_CHAR:!.*]] = !{[[CHAR]], [[CHAR]], i64 0}
// (offset, size) = (0,1) char; (4,2) short; (8,4) int; (12,1) char; (16,4) int; (20,4) int
-// CHECK: [[TS2]] = !{i64 0, i64 1, !{{.*}}, i64 4, i64 2, !{{.*}}, i64 8, i64 4, !{{.*}}, i64 12, i64 1, !{{.*}}, i64 16, i64 4, {{.*}}, i64 20, i64 4, {{.*}}}
+// CHECK-OLD: [[TS2]] = !{i64 0, i64 1, !{{.*}}, i64 4, i64 2, !{{.*}}, i64 8, i64 4, !{{.*}}, i64 12, i64 1, !{{.*}}, i64 16, i64 4, {{.*}}, i64 20, i64 4, {{.*}}}
// (offset, size) = (0,8) char; (0,2) char; (4,8) char
-// CHECK: [[TS3]] = !{i64 0, i64 8, !{{.*}}, i64 0, i64 2, !{{.*}}, i64 4, i64 8, !{{.*}}}
-// CHECK: [[TS4]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 4, [[TAG_INT]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]]}
-// CHECK: [[TS5]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 4, i64 4, [[TAG_INT]], i64 4, i64 1, [[TAG_CHAR]], i64 5, i64 1, [[TAG_CHAR]]}
+// CHECK-OLD: [[TS3]] = !{i64 0, i64 8, !{{.*}}, i64 0, i64 2, !{{.*}}, i64 4, i64 8, !{{.*}}}
+// CHECK-OLD: [[TS4]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 4, [[TAG_INT]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]]}
+// CHECK-OLD: [[TS5]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 4, i64 4, [[TAG_INT]], i64 4, i64 1, [[TAG_CHAR]], i64 5, i64 1, [[TAG_CHAR]]}
+
+// CHECK-NEW-DAG: [[TYPE_char:!.*]] = !{{{.*}}, i64 1, !"omnipotent char"}
+// CHECK-NEW-DAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0, i64 0}
+// CHECK-NEW-DAG: [[TYPE_short:!.*]] = !{[[TYPE_char]], i64 2, !"short"}
+// CHECK-NEW-DAG: [[TYPE_int:!.*]] = !{[[TYPE_char]], i64 4, !"int"}
+// CHECK-NEW-DAG: [[TYPE_A:!.*]] = !{[[TYPE_char]], i64 16, !"_ZTS1A", [[TYPE_short]], i64 0, i64 2, [[TYPE_int]], i64 4, i64 4, [[TYPE_char]], i64 8, i64 1, [[TYPE_int]], i64 12, i64 4}
+// CHECK-NEW-DAG: [[TAG_A]] = !{[[TYPE_A]], [[TYPE_A]], i64 0, i64 16}
+// CHECK-NEW-DAG: [[TYPE_B:!.*]] = !{[[TYPE_char]], i64 24, !"_ZTS1B", [[TYPE_char]], i64 0, i64 1, [[TYPE_A]], i64 4, i64 16, [[TYPE_int]], i64 20, i64 4}
+// CHECK-NEW-DAG: [[TAG_B]] = !{[[TYPE_B]], [[TYPE_B]], i64 0, i64 24}
+// CHECK-NEW-DAG: [[TAG_U]] = !{[[TYPE_char]], [[TYPE_char]], i64 0, i64 12}
+// CHECK-NEW-DAG: [[TYPE_C:!.*]] = !{[[TYPE_char]], i64 3, !"_ZTS1C", [[TYPE_char]], i64 0, i64 1, [[TYPE_int]], i64 1, i64 4, [[TYPE_char]], i64 1, i64 1, [[TYPE_char]], i64 2, i64 1}
+// CHECK-NEW-DAG: [[TAG_C]] = !{[[TYPE_C]], [[TYPE_C]], i64 0, i64 3}
+// CHECK-NEW-DAG: [[TYPE_D:!.*]] = !{[[TYPE_char]], i64 6, !"_ZTS1D", [[TYPE_char]], i64 0, i64 1, [[TYPE_int]], i64 4, i64 4, [[TYPE_char]], i64 4, i64 1, [[TYPE_char]], i64 5, i64 1}
+// CHECK-NEW-DAG: [[TAG_D]] = !{[[TYPE_D]], [[TYPE_D]], i64 0, i64 6}
diff --git a/test/CodeGen/tentative-decls.c b/test/CodeGen/tentative-decls.c
index d88c346d7c52..321778720587 100644
--- a/test/CodeGen/tentative-decls.c
+++ b/test/CodeGen/tentative-decls.c
@@ -1,16 +1,16 @@
-// RUN: %clang_cc1 -emit-llvm -o %t %s
+// RUN: %clang_cc1 -emit-llvm -w -o - %s | FileCheck %s
-// RUN: grep '@r = common global \[1 x .*\] zeroinitializer' %t
+// CHECK-DAG: @r = common {{(dso_local )?}}global [1 x {{.*}}] zeroinitializer
int r[];
int (*a)[] = &r;
struct s0;
struct s0 x;
-// RUN: grep '@x = common global .struct.s0 zeroinitializer' %t
+// CHECK-DAG: @x = common {{(dso_local )?}}global %struct.s0 zeroinitializer
struct s0 y;
-// RUN: grep '@y = common global .struct.s0 zeroinitializer' %t
+// CHECK-DAG: @y = common {{(dso_local )?}}global %struct.s0 zeroinitializer
struct s0 *f0() {
return &y;
}
@@ -19,19 +19,19 @@ struct s0 {
int x;
};
-// RUN: grep '@b = common global \[1 x .*\] zeroinitializer' %t
+// CHECK-DAG: @b = common {{(dso_local )?}}global [1 x {{.*}}] zeroinitializer
int b[];
int *f1() {
return b;
}
// Check that the most recent tentative definition wins.
-// RUN: grep '@c = common global \[4 x .*\] zeroinitializer' %t
+// CHECK-DAG: @c = common {{(dso_local )?}}global [4 x {{.*}}] zeroinitializer
int c[];
int c[4];
// Check that we emit static tentative definitions
-// RUN: grep '@c5 = internal global \[1 x .*\] zeroinitializer' %t
+// CHECK-DAG: @c5 = internal global [1 x {{.*}}] zeroinitializer
static int c5[];
static int func() { return c5[0]; }
int callfunc() { return func(); }
diff --git a/test/CodeGen/thinlto-backend-option.ll b/test/CodeGen/thinlto-backend-option.ll
index 4fcdd079df84..b25b584a2fe1 100644
--- a/test/CodeGen/thinlto-backend-option.ll
+++ b/test/CodeGen/thinlto-backend-option.ll
@@ -1,4 +1,4 @@
-; Test to ensure -backend-options work when invoking the ThinLTO backend path.
+; Test to ensure -mllvm works when invoking the ThinLTO backend path.
; This test uses a non-existent backend option to test that backend options are
; being parsed. While it's more important that the existing options are parsed
@@ -8,8 +8,8 @@
; RUN: %clang -flto=thin -c -o %t.o %s
; RUN: llvm-lto -thinlto -o %t %t.o
-; RUN: not %clang_cc1 -x ir %t.o -fthinlto-index=%t.thinlto.bc -backend-option -nonexistent -emit-obj -o /dev/null 2>&1 | FileCheck %s -check-prefix=UNKNOWN
-; UNKNOWN: clang: Unknown command line argument '-nonexistent'
+; RUN: not %clang_cc1 -x ir %t.o -fthinlto-index=%t.thinlto.bc -mllvm -nonexistent -emit-obj -o /dev/null 2>&1 | FileCheck %s -check-prefix=UNKNOWN
+; UNKNOWN: clang (LLVM option parsing): Unknown command line argument '-nonexistent'
; RUN: not %clang_cc1 -flto=thinfoo 2>&1 | FileCheck %s -check-prefix=INVALID
; INVALID: error: invalid value 'thinfoo' in '-flto=thinfoo'
diff --git a/test/CodeGen/thinlto-diagnostic-handler-remarks-with-hotness.ll b/test/CodeGen/thinlto-diagnostic-handler-remarks-with-hotness.ll
new file mode 100644
index 000000000000..09a2ec890ff9
--- /dev/null
+++ b/test/CodeGen/thinlto-diagnostic-handler-remarks-with-hotness.ll
@@ -0,0 +1,49 @@
+; Test to ensure -fdiagnostics-show-hotness and -fsave-optimization-record
+; work when invoking the ThinLTO backend path.
+; REQUIRES: x86-registered-target
+
+; RUN: opt -module-summary -o %t.o %s
+; RUN: llvm-lto -thinlto -o %t %t.o
+
+; First try YAML pass remarks file
+; RUN: rm -f %t2.opt.yaml
+; RUN: %clang -target x86_64-scei-ps4 -O2 -x ir %t.o -fthinlto-index=%t.thinlto.bc -fsave-optimization-record -fdiagnostics-show-hotness -o %t2.o -c
+; RUN: cat %t2.opt.yaml | FileCheck %s -check-prefix=YAML
+
+; YAML: --- !Passed
+; YAML-NEXT: Pass: inline
+; YAML-NEXT: Name: Inlined
+; YAML-NEXT: Function: main
+; YAML-NEXT: Hotness: 300
+; YAML-NEXT: Args:
+; YAML-NEXT: - Callee: tinkywinky
+; YAML-NEXT: - String: ' inlined into '
+; YAML-NEXT: - Caller: main
+; YAML-NEXT: - String: ' with cost='
+; YAML-NEXT: - Cost: '0'
+; YAML-NEXT: - String: ' (threshold='
+; YAML-NEXT: - Threshold: '337'
+; YAML-NEXT: - String: ')'
+; YAML-NEXT: ...
+
+; Next try with pass remarks to stderr
+; RUN: %clang -target x86_64-scei-ps4 -O2 -x ir %t.o -fthinlto-index=%t.thinlto.bc -mllvm -pass-remarks=inline -fdiagnostics-show-hotness -o %t2.o -c 2>&1 | FileCheck %s
+
+; CHECK: tinkywinky inlined into main with cost=0 (threshold=337) (hotness: 300)
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-scei-ps4"
+
+declare i32 @patatino()
+
+define i32 @tinkywinky() {
+ %a = call i32 @patatino()
+ ret i32 %a
+}
+
+define i32 @main() !prof !0 {
+ %i = call i32 @tinkywinky()
+ ret i32 %i
+}
+
+!0 = !{!"function_entry_count", i64 300}
diff --git a/test/CodeGen/thinlto-distributed-backend-skip.ll b/test/CodeGen/thinlto-distributed-backend-skip.ll
new file mode 100644
index 000000000000..d9fa47d23dfc
--- /dev/null
+++ b/test/CodeGen/thinlto-distributed-backend-skip.ll
@@ -0,0 +1,21 @@
+; REQUIRES: x86-registered-target
+
+; Check that the ThinLTO backend respects the "SkipModuleByDistributedBackend"
+; flag, which can be set by indexing.
+
+; RUN: opt -thinlto-bc -o %t.o %s
+
+; RUN: %clang_cc1 -triple x86_64-grtev4-linux-gnu \
+; RUN: -fthinlto-index=%S/Inputs/thinlto-distributed-backend-skip.bc \
+; RUN: -emit-llvm -o - -x ir %t.o | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-grtev4-linux-gnu"
+
+; CHECK: "empty"
+; CHECK: target triple =
+; CHECK-NOT: @main
+define i32 @main() {
+entry:
+ ret i32 0
+}
diff --git a/test/CodeGen/thinlto-distributed-cfi-devirt.ll b/test/CodeGen/thinlto-distributed-cfi-devirt.ll
new file mode 100644
index 000000000000..ab33a59b6982
--- /dev/null
+++ b/test/CodeGen/thinlto-distributed-cfi-devirt.ll
@@ -0,0 +1,109 @@
+; REQUIRES: x86-registered-target
+
+; Backend test for distributed ThinLTO with CFI.
+; It additionally enables -fwhole-program-vtables to get more information in
+; TYPE_IDs of GLOBALVAL_SUMMARY_BLOCK.
+
+; RUN: opt -thinlto-bc -o %t.o %s
+
+; RUN: llvm-lto2 run -thinlto-distributed-indexes %t.o \
+; RUN: -o %t2.index \
+; RUN: -r=%t.o,test,px \
+; RUN: -r=%t.o,_ZN1A1nEi,p \
+; RUN: -r=%t.o,_ZN1B1fEi,p \
+; RUN: -r=%t.o,_ZN1C1fEi,p \
+; RUN: -r=%t.o,_ZTV1B, \
+; RUN: -r=%t.o,_ZTV1C, \
+; RUN: -r=%t.o,_ZN1A1nEi, \
+; RUN: -r=%t.o,_ZN1B1fEi, \
+; RUN: -r=%t.o,_ZN1C1fEi, \
+; RUN: -r=%t.o,_ZTV1B,px \
+; RUN: -r=%t.o,_ZTV1C,px
+
+; Ensure that typeids are in the index.
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s
+; CHECK-LABEL: <GLOBALVAL_SUMMARY_BLOCK
+; CHECK: <TYPE_ID op0=0 op1=6 op2=4 op3=7 op4=0 op5=0 op6=0 op7=0 op8=0 op9=2 op10=6 op11=0 op12=0 op13=8 op14=1 op15=6 op16=9 op17=0/>
+; CHECK-LABEL: </GLOBALVAL_SUMMARY_BLOCK
+; CHECK-LABEL: <STRTAB_BLOCK
+; CHECK: blob data = '_ZTS1A_ZN1A1nEi'
+; CHECK-LABEL: </STRTAB_BLOCK
+
+; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=CHECK-DIS
+; Round trip it through llvm-as
+; RUN: llvm-dis %t.o.thinlto.bc -o - | llvm-as -o - | llvm-dis -o - | FileCheck %s --check-prefix=CHECK-DIS
+; CHECK-DIS: ^0 = module: (path: "{{.*}}thinlto-distributed-cfi-devirt.ll.tmp.o", hash: ({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}))
+; CHECK-DIS: ^1 = gv: (guid: 8346051122425466633, summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 1, dsoLocal: 0), insts: 18, typeIdInfo: (typeTests: (^2), typeCheckedLoadVCalls: (vFuncId: (^2, offset: 8), vFuncId: (^2, offset: 0))))))
+; CHECK-DIS: ^2 = typeid: (name: "_ZTS1A", summary: (typeTestRes: (kind: allOnes, sizeM1BitWidth: 7), wpdResolutions: ((offset: 0, wpdRes: (kind: branchFunnel)), (offset: 8, wpdRes: (kind: singleImpl, singleImplName: "_ZN1A1nEi"))))) ; guid = 7004155349499253778
+
+; RUN: %clang_cc1 -triple x86_64-grtev4-linux-gnu \
+; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN: -emit-llvm -o - -x ir %t.o | FileCheck %s --check-prefixes=CHECK-IR
+
+; Check that the backend does not fail generating native code.
+; RUN: %clang_cc1 -triple x86_64-grtev4-linux-gnu \
+; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN: -o %t.native.o -x ir %t.o
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-grtev4-linux-gnu"
+
+%struct.A = type { i32 (...)** }
+%struct.B = type { %struct.A }
+%struct.C = type { %struct.A }
+
+@_ZTV1B = constant { [4 x i8*] } { [4 x i8*] [i8* null, i8* undef, i8* bitcast (i32 (%struct.B*, i32)* @_ZN1B1fEi to i8*), i8* bitcast (i32 (%struct.A*, i32)* @_ZN1A1nEi to i8*)] }, !type !0, !type !1
+@_ZTV1C = constant { [4 x i8*] } { [4 x i8*] [i8* null, i8* undef, i8* bitcast (i32 (%struct.C*, i32)* @_ZN1C1fEi to i8*), i8* bitcast (i32 (%struct.A*, i32)* @_ZN1A1nEi to i8*)] }, !type !0, !type !2
+
+; CHECK-IR-LABEL: define i32 @test
+define i32 @test(%struct.A* %obj, i32 %a) {
+entry:
+ %0 = bitcast %struct.A* %obj to i8**
+ %vtable5 = load i8*, i8** %0
+
+ %1 = tail call { i8*, i1 } @llvm.type.checked.load(i8* %vtable5, i32 8, metadata !"_ZTS1A")
+ %2 = extractvalue { i8*, i1 } %1, 1
+ br i1 %2, label %cont, label %trap
+
+trap:
+ tail call void @llvm.trap()
+ unreachable
+
+cont:
+ %3 = extractvalue { i8*, i1 } %1, 0
+ %4 = bitcast i8* %3 to i32 (%struct.A*, i32)*
+
+ ; Check that the call was devirtualized.
+ ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi
+ %call = tail call i32 %4(%struct.A* nonnull %obj, i32 %a)
+ %vtable16 = load i8*, i8** %0
+ %5 = tail call { i8*, i1 } @llvm.type.checked.load(i8* %vtable16, i32 0, metadata !"_ZTS1A")
+ %6 = extractvalue { i8*, i1 } %5, 1
+ br i1 %6, label %cont2, label %trap
+
+cont2:
+ %7 = extractvalue { i8*, i1 } %5, 0
+ %8 = bitcast i8* %7 to i32 (%struct.A*, i32)*
+
+ ; Check that traps are conditional. Invalid TYPE_ID can cause
+ ; unconditional traps.
+ ; CHECK-IR: br i1 {{.*}}, label %trap
+
+ ; We still have to call it as virtual.
+ ; CHECK-IR: %call3 = tail call i32 %8
+ %call3 = tail call i32 %8(%struct.A* nonnull %obj, i32 %call)
+ ret i32 %call3
+}
+; CHECK-IR-LABEL: ret i32
+; CHECK-IR-LABEL: }
+
+declare { i8*, i1 } @llvm.type.checked.load(i8*, i32, metadata)
+declare void @llvm.trap()
+
+declare i32 @_ZN1B1fEi(%struct.B* %this, i32 %a)
+declare i32 @_ZN1A1nEi(%struct.A* %this, i32 %a)
+declare i32 @_ZN1C1fEi(%struct.C* %this, i32 %a)
+
+!0 = !{i64 16, !"_ZTS1A"}
+!1 = !{i64 16, !"_ZTS1B"}
+!2 = !{i64 16, !"_ZTS1C"}
diff --git a/test/CodeGen/thinlto-distributed-cfi.ll b/test/CodeGen/thinlto-distributed-cfi.ll
new file mode 100644
index 000000000000..055b5971eb23
--- /dev/null
+++ b/test/CodeGen/thinlto-distributed-cfi.ll
@@ -0,0 +1,74 @@
+; REQUIRES: x86-registered-target
+
+; Backend test for distributed ThinLTO with CFI.
+
+; RUN: opt -thinlto-bc -o %t.o %s
+
+; RUN: llvm-lto2 run -thinlto-distributed-indexes %t.o \
+; RUN: -o %t2.index \
+; RUN: -r=%t.o,test,px \
+; RUN: -r=%t.o,_ZTV1B, \
+; RUN: -r=%t.o,_ZN1B1fEi, \
+; RUN: -r=%t.o,_ZTV1B,px
+
+; Check that typeids are in the index.
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s
+; CHECK-LABEL: <GLOBALVAL_SUMMARY_BLOCK
+; CHECK: <TYPE_ID op0=0 op1=6 op2=3 op3=0 op4=0 op5=0 op6=0 op7=0/>
+; CHECK-LABEL: </GLOBALVAL_SUMMARY_BLOCK
+; CHECK-LABEL: <STRTAB_BLOCK
+; CHECK: blob data = '_ZTS1A'
+; CHECK-LABEL: </STRTAB_BLOCK
+
+; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=CHECK-DIS
+; Round trip it through llvm-as
+; RUN: llvm-dis %t.o.thinlto.bc -o - | llvm-as -o - | llvm-dis -o - | FileCheck %s --check-prefix=CHECK-DIS
+; CHECK-DIS: ^0 = module: (path: "{{.*}}thinlto-distributed-cfi.ll.tmp.o", hash: ({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}))
+; CHECK-DIS: ^1 = gv: (guid: 8346051122425466633, summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 1, dsoLocal: 0), insts: 7, typeIdInfo: (typeTests: (^2)))))
+; CHECK-DIS: ^2 = typeid: (name: "_ZTS1A", summary: (typeTestRes: (kind: single, sizeM1BitWidth: 0))) ; guid = 7004155349499253778
+
+; RUN: %clang_cc1 -triple x86_64-grtev4-linux-gnu \
+; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN: -emit-llvm -o - -x ir %t.o | FileCheck %s --check-prefixes=CHECK-IR
+
+; Ensure that the backend does not fail generating native code.
+; RUN: %clang_cc1 -triple x86_64-grtev4-linux-gnu \
+; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN: -o %t.native.o -x ir %t.o
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-grtev4-linux-gnu"
+
+%struct.B = type { %struct.A }
+%struct.A = type { i32 (...)** }
+
+@_ZTV1B = constant { [3 x i8*] } { [3 x i8*] [i8* undef, i8* undef, i8* undef] }, !type !0
+
+; CHECK-IR-LABEL: define void @test
+define void @test(i8* %b) {
+entry:
+ ; Ensure that traps are conditional. Invalid TYPE_ID can cause
+ ; unconditional traps.
+ ; CHECK-IR: br i1 {{.*}}, label %trap
+ %0 = bitcast i8* %b to i8**
+ %vtable2 = load i8*, i8** %0
+ %1 = tail call i1 @llvm.type.test(i8* %vtable2, metadata !"_ZTS1A")
+ br i1 %1, label %cont, label %trap
+
+trap:
+ tail call void @llvm.trap()
+ unreachable
+
+cont:
+ ; CHECK-IR-LABEL: ret void
+ ret void
+}
+; CHECK-IR-LABEL: }
+
+declare i1 @llvm.type.test(i8*, metadata)
+declare void @llvm.trap()
+
+declare i32 @_ZN1B1fEi(%struct.B* %this, i32 %a)
+
+!0 = !{i64 16, !"_ZTS1A"}
+!1 = !{i64 16, !"_ZTS1B"}
diff --git a/test/CodeGen/thinlto-distributed.ll b/test/CodeGen/thinlto-distributed.ll
new file mode 100644
index 000000000000..ae21d7b3e9a3
--- /dev/null
+++ b/test/CodeGen/thinlto-distributed.ll
@@ -0,0 +1,21 @@
+; REQUIRES: x86-registered-target
+
+; Trivial test for distributed ThinLTO
+
+; RUN: opt -thinlto-bc -o %t.o %s
+
+; RUN: llvm-lto2 run -thinlto-distributed-indexes %t.o \
+; RUN: -o %t2.index \
+; RUN: -r=%t.o,main,px
+
+; RUN: %clang_cc1 -triple x86_64-grtev4-linux-gnu \
+; RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+; RUN: -o %t.native.o -x ir %t.o
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-grtev4-linux-gnu"
+
+define i32 @main() {
+entry:
+ ret i32 0
+}
diff --git a/test/CodeGen/thinlto-split-dwarf.c b/test/CodeGen/thinlto-split-dwarf.c
new file mode 100644
index 000000000000..2a0d82b34c07
--- /dev/null
+++ b/test/CodeGen/thinlto-split-dwarf.c
@@ -0,0 +1,21 @@
+// REQUIRES: x86-registered-target
+
+// RUN: %clang_cc1 -debug-info-kind=limited -triple x86_64-unknown-linux-gnu \
+// RUN: -flto=thin -emit-llvm-bc \
+// RUN: -o %t.o %s
+
+// RUN: llvm-lto2 run -thinlto-distributed-indexes %t.o \
+// RUN: -o %t2.index \
+// RUN: -r=%t.o,main,px
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \
+// RUN: -emit-obj -fthinlto-index=%t.o.thinlto.bc \
+// RUN: -o %t.native.o -split-dwarf-file %t.native.dwo -x ir %t.o
+
+// RUN: llvm-readobj -sections %t.native.o | FileCheck --check-prefix=O %s
+// RUN: llvm-readobj -sections %t.native.dwo | FileCheck --check-prefix=DWO %s
+
+// O-NOT: .dwo
+// DWO: .dwo
+
+int main() {}
diff --git a/test/CodeGen/thinlto_backend.ll b/test/CodeGen/thinlto_backend.ll
index 86f30c0374fc..5ba846457dfd 100644
--- a/test/CodeGen/thinlto_backend.ll
+++ b/test/CodeGen/thinlto_backend.ll
@@ -1,4 +1,5 @@
-; REQUIRES: x86-registered-target
+; shell is required since the Windows bot does not like the "(cd ..." construct
+; REQUIRES: x86-registered-target, shell
; RUN: opt -module-summary -o %t1.o %s
; RUN: opt -module-summary -o %t2.o %S/Inputs/thinlto_backend.ll
@@ -20,8 +21,22 @@
; CHECK-OBJ-IGNORE-EMPTY: T f1
; CHECK-OBJ-IGNORE-EMPTY: U f2
-; Ensure f2 was imported
-; RUN: %clang -target x86_64-unknown-linux-gnu -O2 -o %t3.o -x ir %t1.o -c -fthinlto-index=%t.thinlto.bc
+; Ensure we don't fail with an index and a non-ThinLTO object file, and that
+; the output is an empty file.
+; RUN: opt -o %t5.o %s
+; RUN: %clang -target x86_64-unknown-linux-gnu -O2 -o %t4.o -x ir %t5.o -c -fthinlto-index=%t.thinlto.bc
+; RUN: llvm-nm %t4.o | count 0
+
+; Ensure f2 was imported. Check for all 3 flavors of -save-temps[=cwd|obj].
+; RUN: %clang -target x86_64-unknown-linux-gnu -O2 -o %t3.o -x ir %t1.o -c -fthinlto-index=%t.thinlto.bc -save-temps=obj
+; RUN: llvm-dis %t1.s.3.import.bc -o - | FileCheck --check-prefix=CHECK-IMPORT %s
+; RUN: mkdir -p %T/dir1
+; RUN: (cd %T/dir1 && %clang -target x86_64-unknown-linux-gnu -O2 -o %t3.o -x ir %t1.o -c -fthinlto-index=%t.thinlto.bc -save-temps=cwd)
+; RUN: llvm-dis %T/dir1/*1.s.3.import.bc -o - | FileCheck --check-prefix=CHECK-IMPORT %s
+; RUN: mkdir -p %T/dir2
+; RUN: (cd %T/dir2 && %clang -target x86_64-unknown-linux-gnu -O2 -o %t3.o -x ir %t1.o -c -fthinlto-index=%t.thinlto.bc -save-temps)
+; RUN: llvm-dis %T/dir2/*1.s.3.import.bc -o - | FileCheck --check-prefix=CHECK-IMPORT %s
+; CHECK-IMPORT: define available_externally void @f2()
; RUN: llvm-nm %t3.o | FileCheck --check-prefix=CHECK-OBJ %s
; CHECK-OBJ: T f1
; CHECK-OBJ-NOT: U f2
diff --git a/test/CodeGen/transparent-union-redecl.c b/test/CodeGen/transparent-union-redecl.c
new file mode 100644
index 000000000000..31192089827d
--- /dev/null
+++ b/test/CodeGen/transparent-union-redecl.c
@@ -0,0 +1,44 @@
+// RUN: %clang_cc1 -Werror -triple i386-linux -emit-llvm -o - %s | FileCheck %s
+
+// Test that a different order of declarations is acceptable and that
+// implementing different redeclarations is acceptable.
+// rdar://problem/34949329
+
+typedef union {
+ int i;
+ float f;
+} TU __attribute__((transparent_union));
+
+// CHECK-LABEL: define void @f0(i32 %tu.coerce)
+// CHECK: %tu = alloca %union.TU, align 4
+// CHECK: %coerce.dive = getelementptr inbounds %union.TU, %union.TU* %tu, i32 0, i32 0
+// CHECK: store i32 %tu.coerce, i32* %coerce.dive, align 4
+void f0(TU tu) {}
+void f0(int i);
+
+// CHECK-LABEL: define void @f1(i32 %tu.coerce)
+// CHECK: %tu = alloca %union.TU, align 4
+// CHECK: %coerce.dive = getelementptr inbounds %union.TU, %union.TU* %tu, i32 0, i32 0
+// CHECK: store i32 %tu.coerce, i32* %coerce.dive, align 4
+void f1(int i);
+void f1(TU tu) {}
+
+// CHECK-LABEL: define void @f2(i32 %i)
+// CHECK: %i.addr = alloca i32, align 4
+// CHECK: store i32 %i, i32* %i.addr, align 4
+void f2(TU tu);
+void f2(int i) {}
+
+// CHECK-LABEL: define void @f3(i32 %i)
+// CHECK: %i.addr = alloca i32, align 4
+// CHECK: store i32 %i, i32* %i.addr, align 4
+void f3(int i) {}
+void f3(TU tu);
+
+// Also test functions with parameters specified K&R style.
+// CHECK-LABEL: define void @knrStyle(i32 %tu.coerce)
+// CHECK: %tu = alloca %union.TU, align 4
+// CHECK: %coerce.dive = getelementptr inbounds %union.TU, %union.TU* %tu, i32 0, i32 0
+// CHECK: store i32 %tu.coerce, i32* %coerce.dive, align 4
+void knrStyle(int i);
+void knrStyle(tu) TU tu; {}
diff --git a/test/CodeGen/vaes-builtins.c b/test/CodeGen/vaes-builtins.c
index df160aa16ee3..1dd5784814d3 100644
--- a/test/CodeGen/vaes-builtins.c
+++ b/test/CodeGen/vaes-builtins.c
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vaes -emit-llvm -o - | FileCheck %s --check-prefix AVX
-// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -DAVX512 -target-feature +vaes -target-feature +avx512f -emit-llvm -o - | FileCheck %s --check-prefixes AVX,AVX512
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vaes -target-feature +avx512f -emit-llvm -o - | FileCheck %s --check-prefixes AVX,AVX512
#include <immintrin.h>
@@ -27,7 +27,7 @@ __m256i test_mm256_aesdeclast_epi128(__m256i __A, __m256i __B) {
return _mm256_aesdeclast_epi128(__A, __B);
}
-#ifdef AVX512
+#ifdef __AVX512F__
__m512i test_mm512_aesenc_epi128(__m512i __A, __m512i __B) {
// AVX512-LABEL: @test_mm512_aesenc_epi128
// AVX512: @llvm.x86.aesni.aesenc.512
diff --git a/test/CodeGen/variadic-null-win64.c b/test/CodeGen/variadic-null-win64.c
index 23c85b835756..a52fb896ddab 100644
--- a/test/CodeGen/variadic-null-win64.c
+++ b/test/CodeGen/variadic-null-win64.c
@@ -15,7 +15,7 @@ void f(const char *f) {
v(f, 1, 2, 3, NULL);
kr(f, 1, 2, 3, 0);
}
-// WINDOWS: define void @f(i8* %f)
+// WINDOWS: define dso_local void @f(i8* %f)
// WINDOWS: call void (i8*, ...) @v(i8* {{.*}}, i32 1, i32 2, i32 3, i64 0)
// WINDOWS: call void bitcast (void (...)* @kr to void (i8*, i32, i32, i32, i32)*)(i8* {{.*}}, i32 1, i32 2, i32 3, i32 0)
// LINUX: define void @f(i8* %f)
diff --git a/test/CodeGen/vector-scalar.c b/test/CodeGen/vector-scalar.c
new file mode 100644
index 000000000000..0c973cd41ee4
--- /dev/null
+++ b/test/CodeGen/vector-scalar.c
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s
+
+// PR27085
+
+typedef unsigned char uchar4 __attribute__ ((vector_size (4)));
+
+// CHECK: @add2
+// CHECK: add <4 x i8> {{.*}}, <i8 2, i8 2, i8 2, i8 2>
+uchar4 add2(uchar4 v)
+{
+ return v + 2;
+}
+
+// CHECK: @sub2
+// CHECK: sub <4 x i8> {{.*}}, <i8 2, i8 2, i8 2, i8 2>
+uchar4 sub2(uchar4 v)
+{
+ return v - 2;
+}
+
+// CHECK: @mul2
+// CHECK: mul <4 x i8> {{.*}}, <i8 2, i8 2, i8 2, i8 2>
+uchar4 mul2(uchar4 v)
+{
+ return v * 2;
+}
+
+// CHECK: @div2
+// CHECK: udiv <4 x i8> {{.*}}, <i8 2, i8 2, i8 2, i8 2>
+uchar4 div2(uchar4 v)
+{
+ return v / 2;
+}
+
+typedef __attribute__(( ext_vector_type(4) )) unsigned char uchar4_ext;
+
+// CHECK: @div3_ext
+// CHECK: udiv <4 x i8> %{{.*}}, <i8 3, i8 3, i8 3, i8 3>
+uchar4_ext div3_ext(uchar4_ext v)
+{
+ return v / 3;
+}
diff --git a/test/CodeGen/vector.c b/test/CodeGen/vector.c
index ebaea841aa85..c79cd84f7e2e 100644
--- a/test/CodeGen/vector.c
+++ b/test/CodeGen/vector.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -ffreestanding -triple i386-apple-darwin9 -O1 -target-cpu core2 -debug-info-kind=limited -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding -triple i386-apple-darwin9 -O1 -target-cpu corei7 -debug-info-kind=limited -emit-llvm %s -o - | FileCheck %s
typedef short __v4hi __attribute__ ((__vector_size__ (8)));
void test1() {
@@ -45,13 +45,13 @@ int test4(int argc, char *argv[]) {
unsigned long test_epi8(__m128i x) { return _mm_extract_epi8(x, 4); }
// CHECK: @test_epi8
-// CHECK: extractelement <16 x i8> {{.*}}, i32 4
+// CHECK: extractelement <16 x i8> {{.*}}, {{i32|i64}} 4
// CHECK: zext i8 {{.*}} to i32
unsigned long test_epi16(__m128i x) { return _mm_extract_epi16(x, 3); }
// CHECK: @test_epi16
-// CHECK: extractelement <8 x i16> {{.*}}, i32 3
+// CHECK: extractelement <8 x i16> {{.*}}, {{i32|i64}} 3
// CHECK: zext i16 {{.*}} to i32
void extractinttypes() {
diff --git a/test/CodeGen/vectorcall.c b/test/CodeGen/vectorcall.c
index fa244fb908e0..564b41e1f54d 100644
--- a/test/CodeGen/vectorcall.c
+++ b/test/CodeGen/vectorcall.c
@@ -2,56 +2,56 @@
// RUN: %clang_cc1 -emit-llvm %s -o - -ffreestanding -triple=x86_64-pc-win32 | FileCheck %s --check-prefix=X64
void __vectorcall v1(int a, int b) {}
-// X32: define x86_vectorcallcc void @"\01v1@@8"(i32 inreg %a, i32 inreg %b)
-// X64: define x86_vectorcallcc void @"\01v1@@16"(i32 %a, i32 %b)
+// X32: define dso_local x86_vectorcallcc void @"\01v1@@8"(i32 inreg %a, i32 inreg %b)
+// X64: define dso_local x86_vectorcallcc void @"\01v1@@16"(i32 %a, i32 %b)
void __vectorcall v2(char a, char b) {}
-// X32: define x86_vectorcallcc void @"\01v2@@8"(i8 inreg signext %a, i8 inreg signext %b)
-// X64: define x86_vectorcallcc void @"\01v2@@16"(i8 %a, i8 %b)
+// X32: define dso_local x86_vectorcallcc void @"\01v2@@8"(i8 inreg signext %a, i8 inreg signext %b)
+// X64: define dso_local x86_vectorcallcc void @"\01v2@@16"(i8 %a, i8 %b)
struct Small { int x; };
void __vectorcall v3(int a, struct Small b, int c) {}
-// X32: define x86_vectorcallcc void @"\01v3@@12"(i32 inreg %a, i32 %b.0, i32 inreg %c)
-// X64: define x86_vectorcallcc void @"\01v3@@24"(i32 %a, i32 %b.coerce, i32 %c)
+// X32: define dso_local x86_vectorcallcc void @"\01v3@@12"(i32 inreg %a, i32 %b.0, i32 inreg %c)
+// X64: define dso_local x86_vectorcallcc void @"\01v3@@24"(i32 %a, i32 %b.coerce, i32 %c)
struct Large { int a[5]; };
void __vectorcall v4(int a, struct Large b, int c) {}
-// X32: define x86_vectorcallcc void @"\01v4@@28"(i32 inreg %a, %struct.Large* byval align 4 %b, i32 inreg %c)
-// X64: define x86_vectorcallcc void @"\01v4@@40"(i32 %a, %struct.Large* %b, i32 %c)
+// X32: define dso_local x86_vectorcallcc void @"\01v4@@28"(i32 inreg %a, %struct.Large* byval align 4 %b, i32 inreg %c)
+// X64: define dso_local x86_vectorcallcc void @"\01v4@@40"(i32 %a, %struct.Large* %b, i32 %c)
struct HFA2 { double x, y; };
struct HFA4 { double w, x, y, z; };
struct HFA5 { double v, w, x, y, z; };
void __vectorcall hfa1(int a, struct HFA4 b, int c) {}
-// X32: define x86_vectorcallcc void @"\01hfa1@@40"(i32 inreg %a, %struct.HFA4 inreg %b.coerce, i32 inreg %c)
-// X64: define x86_vectorcallcc void @"\01hfa1@@48"(i32 %a, %struct.HFA4 inreg %b.coerce, i32 %c)
+// X32: define dso_local x86_vectorcallcc void @"\01hfa1@@40"(i32 inreg %a, %struct.HFA4 inreg %b.coerce, i32 inreg %c)
+// X64: define dso_local x86_vectorcallcc void @"\01hfa1@@48"(i32 %a, %struct.HFA4 inreg %b.coerce, i32 %c)
// HFAs that would require more than six total SSE registers are passed
// indirectly. Additional vector arguments can consume the rest of the SSE
// registers.
void __vectorcall hfa2(struct HFA4 a, struct HFA4 b, double c) {}
-// X32: define x86_vectorcallcc void @"\01hfa2@@72"(%struct.HFA4 inreg %a.coerce, %struct.HFA4* inreg %b, double %c)
-// X64: define x86_vectorcallcc void @"\01hfa2@@72"(%struct.HFA4 inreg %a.coerce, %struct.HFA4* %b, double %c)
+// X32: define dso_local x86_vectorcallcc void @"\01hfa2@@72"(%struct.HFA4 inreg %a.coerce, %struct.HFA4* inreg %b, double %c)
+// X64: define dso_local x86_vectorcallcc void @"\01hfa2@@72"(%struct.HFA4 inreg %a.coerce, %struct.HFA4* %b, double %c)
// Ensure that we pass builtin types directly while counting them against the
// SSE register usage.
void __vectorcall hfa3(double a, double b, double c, double d, double e, struct HFA2 f) {}
-// X32: define x86_vectorcallcc void @"\01hfa3@@56"(double %a, double %b, double %c, double %d, double %e, %struct.HFA2* inreg %f)
-// X64: define x86_vectorcallcc void @"\01hfa3@@56"(double %a, double %b, double %c, double %d, double %e, %struct.HFA2* %f)
+// X32: define dso_local x86_vectorcallcc void @"\01hfa3@@56"(double %a, double %b, double %c, double %d, double %e, %struct.HFA2* inreg %f)
+// X64: define dso_local x86_vectorcallcc void @"\01hfa3@@56"(double %a, double %b, double %c, double %d, double %e, %struct.HFA2* %f)
// Aggregates with more than four elements are not HFAs and are passed byval.
// Because they are not classified as homogeneous, they don't get special
// handling to ensure alignment.
void __vectorcall hfa4(struct HFA5 a) {}
-// X32: define x86_vectorcallcc void @"\01hfa4@@40"(%struct.HFA5* byval align 4)
-// X64: define x86_vectorcallcc void @"\01hfa4@@40"(%struct.HFA5* %a)
+// X32: define dso_local x86_vectorcallcc void @"\01hfa4@@40"(%struct.HFA5* byval align 4)
+// X64: define dso_local x86_vectorcallcc void @"\01hfa4@@40"(%struct.HFA5* %a)
// Return HFAs of 4 or fewer elements in registers.
static struct HFA2 g_hfa2;
struct HFA2 __vectorcall hfa5(void) { return g_hfa2; }
-// X32: define x86_vectorcallcc %struct.HFA2 @"\01hfa5@@0"()
-// X64: define x86_vectorcallcc %struct.HFA2 @"\01hfa5@@0"()
+// X32: define dso_local x86_vectorcallcc %struct.HFA2 @"\01hfa5@@0"()
+// X64: define dso_local x86_vectorcallcc %struct.HFA2 @"\01hfa5@@0"()
typedef float __attribute__((vector_size(16))) v4f32;
struct HVA2 { v4f32 x, y; };
@@ -60,52 +60,52 @@ struct HVA4 { v4f32 w, x, y, z; };
struct HVA5 { v4f32 w, x, y, z, p; };
v4f32 __vectorcall hva1(int a, struct HVA4 b, int c) {return b.w;}
-// X32: define x86_vectorcallcc <4 x float> @"\01hva1@@72"(i32 inreg %a, %struct.HVA4 inreg %b.coerce, i32 inreg %c)
-// X64: define x86_vectorcallcc <4 x float> @"\01hva1@@80"(i32 %a, %struct.HVA4 inreg %b.coerce, i32 %c)
+// X32: define dso_local x86_vectorcallcc <4 x float> @"\01hva1@@72"(i32 inreg %a, %struct.HVA4 inreg %b.coerce, i32 inreg %c)
+// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva1@@80"(i32 %a, %struct.HVA4 inreg %b.coerce, i32 %c)
v4f32 __vectorcall hva2(struct HVA4 a, struct HVA4 b, v4f32 c) {return c;}
-// X32: define x86_vectorcallcc <4 x float> @"\01hva2@@144"(%struct.HVA4 inreg %a.coerce, %struct.HVA4* inreg %b, <4 x float> %c)
-// X64: define x86_vectorcallcc <4 x float> @"\01hva2@@144"(%struct.HVA4 inreg %a.coerce, %struct.HVA4* %b, <4 x float> %c)
+// X32: define dso_local x86_vectorcallcc <4 x float> @"\01hva2@@144"(%struct.HVA4 inreg %a.coerce, %struct.HVA4* inreg %b, <4 x float> %c)
+// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva2@@144"(%struct.HVA4 inreg %a.coerce, %struct.HVA4* %b, <4 x float> %c)
v4f32 __vectorcall hva3(v4f32 a, v4f32 b, v4f32 c, v4f32 d, v4f32 e, struct HVA2 f) {return f.x;}
-// X32: define x86_vectorcallcc <4 x float> @"\01hva3@@112"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* inreg %f)
-// X64: define x86_vectorcallcc <4 x float> @"\01hva3@@112"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* %f)
+// X32: define dso_local x86_vectorcallcc <4 x float> @"\01hva3@@112"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* inreg %f)
+// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva3@@112"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* %f)
 // Vector types have higher priority than HVA structures, so vector types are allocated first
 // and HVAs are allocated if enough registers are available
v4f32 __vectorcall hva4(struct HVA4 a, struct HVA2 b, v4f32 c) {return b.y;}
-// X32: define x86_vectorcallcc <4 x float> @"\01hva4@@112"(%struct.HVA4 inreg %a.coerce, %struct.HVA2* inreg %b, <4 x float> %c)
-// X64: define x86_vectorcallcc <4 x float> @"\01hva4@@112"(%struct.HVA4 inreg %a.coerce, %struct.HVA2* %b, <4 x float> %c)
+// X32: define dso_local x86_vectorcallcc <4 x float> @"\01hva4@@112"(%struct.HVA4 inreg %a.coerce, %struct.HVA2* inreg %b, <4 x float> %c)
+// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva4@@112"(%struct.HVA4 inreg %a.coerce, %struct.HVA2* %b, <4 x float> %c)
v4f32 __vectorcall hva5(struct HVA3 a, struct HVA3 b, v4f32 c, struct HVA2 d) {return d.y;}
-// X32: define x86_vectorcallcc <4 x float> @"\01hva5@@144"(%struct.HVA3 inreg %a.coerce, %struct.HVA3* inreg %b, <4 x float> %c, %struct.HVA2 inreg %d.coerce)
-// X64: define x86_vectorcallcc <4 x float> @"\01hva5@@144"(%struct.HVA3 inreg %a.coerce, %struct.HVA3* %b, <4 x float> %c, %struct.HVA2 inreg %d.coerce)
+// X32: define dso_local x86_vectorcallcc <4 x float> @"\01hva5@@144"(%struct.HVA3 inreg %a.coerce, %struct.HVA3* inreg %b, <4 x float> %c, %struct.HVA2 inreg %d.coerce)
+// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva5@@144"(%struct.HVA3 inreg %a.coerce, %struct.HVA3* %b, <4 x float> %c, %struct.HVA2 inreg %d.coerce)
struct HVA4 __vectorcall hva6(struct HVA4 a, struct HVA4 b) { return b;}
-// X32: define x86_vectorcallcc %struct.HVA4 @"\01hva6@@128"(%struct.HVA4 inreg %a.coerce, %struct.HVA4* inreg %b)
-// X64: define x86_vectorcallcc %struct.HVA4 @"\01hva6@@128"(%struct.HVA4 inreg %a.coerce, %struct.HVA4* %b)
+// X32: define dso_local x86_vectorcallcc %struct.HVA4 @"\01hva6@@128"(%struct.HVA4 inreg %a.coerce, %struct.HVA4* inreg %b)
+// X64: define dso_local x86_vectorcallcc %struct.HVA4 @"\01hva6@@128"(%struct.HVA4 inreg %a.coerce, %struct.HVA4* %b)
struct HVA5 __vectorcall hva7() {struct HVA5 a = {}; return a;}
-// X32: define x86_vectorcallcc void @"\01hva7@@0"(%struct.HVA5* inreg noalias sret %agg.result)
-// X64: define x86_vectorcallcc void @"\01hva7@@0"(%struct.HVA5* noalias sret %agg.result)
+// X32: define dso_local x86_vectorcallcc void @"\01hva7@@0"(%struct.HVA5* inreg noalias sret %agg.result)
+// X64: define dso_local x86_vectorcallcc void @"\01hva7@@0"(%struct.HVA5* noalias sret %agg.result)
v4f32 __vectorcall hva8(v4f32 a, v4f32 b, v4f32 c, v4f32 d, int e, v4f32 f) {return f;}
-// X32: define x86_vectorcallcc <4 x float> @"\01hva8@@84"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, i32 inreg %e, <4 x float> %f)
-// X64: define x86_vectorcallcc <4 x float> @"\01hva8@@88"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, i32 %e, <4 x float> %f)
+// X32: define dso_local x86_vectorcallcc <4 x float> @"\01hva8@@84"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, i32 inreg %e, <4 x float> %f)
+// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva8@@88"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, i32 %e, <4 x float> %f)
typedef float __attribute__((ext_vector_type(3))) v3f32;
struct OddSizeHVA { v3f32 x, y; };
void __vectorcall odd_size_hva(struct OddSizeHVA a) {}
-// X32: define x86_vectorcallcc void @"\01odd_size_hva@@32"(%struct.OddSizeHVA inreg %a.coerce)
-// X64: define x86_vectorcallcc void @"\01odd_size_hva@@32"(%struct.OddSizeHVA inreg %a.coerce)
+// X32: define dso_local x86_vectorcallcc void @"\01odd_size_hva@@32"(%struct.OddSizeHVA inreg %a.coerce)
+// X64: define dso_local x86_vectorcallcc void @"\01odd_size_hva@@32"(%struct.OddSizeHVA inreg %a.coerce)
// The Vectorcall ABI only allows passing the first 6 items in registers in x64, so this shouldn't
// consider 'p7' as a register. Instead p5 gets put into the register on the second pass.
// x86 should pass p2, p6 and p7 in registers, then p1 in the second pass.
struct HFA2 __vectorcall AddParticles(struct HFA2 p1, float p2, struct HFA4 p3, int p4, struct HFA2 p5, float p6, float p7, int p8){ return p1;}
-// X32: define x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@84"(%struct.HFA2 inreg %p1.coerce, float %p2, %struct.HFA4* inreg %p3, i32 inreg %p4, %struct.HFA2* %p5, float %p6, float %p7, i32 %p8)
-// X64: define x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@104"(%struct.HFA2 inreg %p1.coerce, float %p2, %struct.HFA4* %p3, i32 %p4, %struct.HFA2 inreg %p5.coerce, float %p6, float %p7, i32 %p8)
+// X32: define dso_local x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@84"(%struct.HFA2 inreg %p1.coerce, float %p2, %struct.HFA4* inreg %p3, i32 inreg %p4, %struct.HFA2* %p5, float %p6, float %p7, i32 %p8)
+// X64: define dso_local x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@104"(%struct.HFA2 inreg %p1.coerce, float %p2, %struct.HFA4* %p3, i32 %p4, %struct.HFA2 inreg %p5.coerce, float %p6, float %p7, i32 %p8)
// Vectorcall in both architectures allows passing of an HVA as long as there is room,
// even if it is not one of the first 6 arguments. First pass puts p4 into a
@@ -113,6 +113,6 @@ struct HFA2 __vectorcall AddParticles(struct HFA2 p1, float p2, struct HFA4 p3,
 // in a register, does NOT put p7 in a register (since there's no room), then puts
// p8 in a register.
void __vectorcall HVAAnywhere(struct HFA2 p1, int p2, int p3, float p4, int p5, int p6, struct HFA4 p7, struct HFA2 p8, float p9){}
-// X32: define x86_vectorcallcc void @"\01HVAAnywhere@@88"(%struct.HFA2 inreg %p1.coerce, i32 inreg %p2, i32 inreg %p3, float %p4, i32 %p5, i32 %p6, %struct.HFA4* %p7, %struct.HFA2 inreg %p8.coerce, float %p9)
-// X64: define x86_vectorcallcc void @"\01HVAAnywhere@@112"(%struct.HFA2 inreg %p1.coerce, i32 %p2, i32 %p3, float %p4, i32 %p5, i32 %p6, %struct.HFA4* %p7, %struct.HFA2 inreg %p8.coerce, float %p9)
+// X32: define dso_local x86_vectorcallcc void @"\01HVAAnywhere@@88"(%struct.HFA2 inreg %p1.coerce, i32 inreg %p2, i32 inreg %p3, float %p4, i32 %p5, i32 %p6, %struct.HFA4* %p7, %struct.HFA2 inreg %p8.coerce, float %p9)
+// X64: define dso_local x86_vectorcallcc void @"\01HVAAnywhere@@112"(%struct.HFA2 inreg %p1.coerce, i32 %p2, i32 %p3, float %p4, i32 %p5, i32 %p6, %struct.HFA4* %p7, %struct.HFA2 inreg %p8.coerce, float %p9)
diff --git a/test/CodeGen/vla.c b/test/CodeGen/vla.c
index 0f2e2cdc669d..37243cd17290 100644
--- a/test/CodeGen/vla.c
+++ b/test/CodeGen/vla.c
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -triple i386-unknown-unknown %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple i386-unknown-unknown %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK,NULL-INVALID
+// RUN: %clang_cc1 -triple i386-unknown-unknown %s -emit-llvm -fno-delete-null-pointer-checks -o - | FileCheck %s -check-prefixes=CHECK,NULL-VALID
int b(char* x);
@@ -202,5 +203,6 @@ void test8(int a[static 3]) { }
// CHECK: define void @test8(i32* dereferenceable(12) %a)
void test9(int n, int a[static n]) { }
-// CHECK: define void @test9(i32 %n, i32* nonnull %a)
+// NULL-INVALID: define void @test9(i32 %n, i32* nonnull %a)
+// NULL-VALID: define void @test9(i32 %n, i32* %a)
diff --git a/test/CodeGen/vld_dup.c b/test/CodeGen/vld_dup.c
deleted file mode 100644
index d910c82966ea..000000000000
--- a/test/CodeGen/vld_dup.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// REQUIRES: arm-registered-target
-// RUN: %clang_cc1 -triple armv7a-linux-gnueabi \
-// RUN: -target-cpu cortex-a8 \
-// RUN: -emit-llvm -o - %s | FileCheck %s
-#include <arm_neon.h>
-int main(){
- int32_t v0[3];
- int32x2x3_t v1;
- int32_t v2[4];
- int32x2x4_t v3;
- int64x1x3_t v4;
- int64x1x4_t v5;
- int64_t v6[3];
- int64_t v7[4];
-
- v1 = vld3_dup_s32(v0);
-// CHECK: [[T168:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* {{.*}}, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 {{[0-9]+}}, i32 {{[0-9]+}})
-// CHECK-NEXT: [[T169:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[T168]], 0
-// CHECK-NEXT: [[T170:%.*]] = shufflevector <2 x i32> [[T169]], <2 x i32> [[T169]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[T171:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[T168]], <2 x i32> [[T170]], 0
-// CHECK-NEXT: [[T172:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[T171]], 1
-// CHECK-NEXT: [[T173:%.*]] = shufflevector <2 x i32> [[T172]], <2 x i32> [[T172]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[T174:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[T171]], <2 x i32> [[T173]], 1
-// CHECK-NEXT: [[T175:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[T174]], 2
-// CHECK-NEXT: [[T176:%.*]] = shufflevector <2 x i32> [[T175]], <2 x i32> [[T175]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[T177:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[T174]], <2 x i32> [[T176]], 2
-
- v3 = vld4_dup_s32(v2);
-// CHECK: [[T178:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* {{.*}}, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 {{[0-9]+}}, i32 {{[0-9]+}})
-// CHECK-NEXT: [[T179:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[T178]], 0
-// CHECK-NEXT: [[T180:%.*]] = shufflevector <2 x i32> [[T179]], <2 x i32> [[T179]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[T181:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[T178]], <2 x i32> [[T180]], 0
-// CHECK-NEXT: [[T182:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[T181]], 1
-// CHECK-NEXT: [[T183:%.*]] = shufflevector <2 x i32> [[T182]], <2 x i32> [[T182]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[T184:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[T181]], <2 x i32> [[T183]], 1
-// CHECK-NEXT: [[T185:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[T184]], 2
-// CHECK-NEXT: [[T186:%.*]] = shufflevector <2 x i32> [[T185]], <2 x i32> [[T185]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[T187:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[T184]], <2 x i32> [[T186]], 2
-// CHECK-NEXT: [[T188:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[T187]], 3
-// CHECK-NEXT: [[T189:%.*]] = shufflevector <2 x i32> [[T188]], <2 x i32> [[T188]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[T190:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[T187]], <2 x i32> [[T189]], 3
-
- v4 = vld3_dup_s64(v6);
-// CHECK: {{%.*}} = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* {{.*}}, i32 {{[0-9]+}})
-
- v5 = vld4_dup_s64(v7);
-// CHECK: {{%.*}} = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* {{.*}}, i32 {{[0-9]+}})
-
- return 0;
-}
diff --git a/test/CodeGen/volatile-1.c b/test/CodeGen/volatile-1.c
index f63274b37f61..c62a8fd4cd3d 100644
--- a/test/CodeGen/volatile-1.c
+++ b/test/CodeGen/volatile-1.c
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 -Wno-return-type -Wno-unused-value -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -Wno-return-type -Wno-unused-value -emit-llvm %s -w -o - | FileCheck %s
-// CHECK: @i = common global [[INT:i[0-9]+]] 0
+// CHECK: @i = common {{(dso_local )?}}global [[INT:i[0-9]+]] 0
volatile int i, j, k;
volatile int ar[5];
volatile char c;
-// CHECK: @ci = common global [[CINT:.*]] zeroinitializer
+// CHECK: @ci = common {{(dso_local )?}}global [[CINT:.*]] zeroinitializer
volatile _Complex int ci;
volatile struct S {
#ifdef __cplusplus
diff --git a/test/CodeGen/volatile.c b/test/CodeGen/volatile.c
index 52915f6f4d5b..0f58bb62a248 100644
--- a/test/CodeGen/volatile.c
+++ b/test/CodeGen/volatile.c
@@ -199,7 +199,7 @@ int main() {
// CHECK: store volatile i32 {{.*}}, i32* @vtS
(void)vF2;
// From vF2 to a temporary
-// CHECK: call void @llvm.memcpy.{{.*}}(i8* %{{.*}}, i8* {{.*}} @vF2 {{.*}}, i1 true)
+// CHECK: call void @llvm.memcpy.{{.*}}(i8* align {{[0-9]+}} %{{.*}}, i8* {{.*}} @vF2 {{.*}}, i1 true)
vF2 = vF2;
// vF2 to itself
// CHECK: call void @llvm.memcpy.{{.*}}(i8* {{.*@vF2.*}}, i8* {{.*@vF2.*}}, i1 true)
@@ -209,6 +209,6 @@ int main() {
// CHECK: call void @llvm.memcpy.{{.*}}(i8* {{.*@vF2.*}}, i8* {{.*@vF2.*}}, i1 true)
vF2 = (vF2, vF2);
// vF2 to a temporary, then vF2 to itself
-// CHECK: call void @llvm.memcpy.{{.*}}(i8* %{{.*}}, i8* {{.*@vF2.*}}, i1 true)
+// CHECK: call void @llvm.memcpy.{{.*}}(i8* align {{[0-9]+}} %{{.*}}, i8* {{.*@vF2.*}}, i1 true)
// CHECK: call void @llvm.memcpy.{{.*}}(i8* {{.*@vF2.*}}, i8* {{.*@vF2.*}}, i1 true)
}
diff --git a/test/CodeGen/vpclmulqdq-builtins.c b/test/CodeGen/vpclmulqdq-builtins.c
index 8c610e2d851c..15eef399bcdf 100644
--- a/test/CodeGen/vpclmulqdq-builtins.c
+++ b/test/CodeGen/vpclmulqdq-builtins.c
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -emit-llvm -o - | FileCheck %s --check-prefix AVX
-// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -DAVX512 -target-feature +vpclmulqdq -target-feature +avx512f -emit-llvm -o - | FileCheck %s --check-prefixes AVX,AVX512
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -target-feature +avx512f -emit-llvm -o - | FileCheck %s --check-prefixes AVX,AVX512
#include <immintrin.h>
@@ -8,7 +8,7 @@ __m256i test_mm256_clmulepi64_epi128(__m256i A, __m256i B) {
return _mm256_clmulepi64_epi128(A, B, 0);
}
-#ifdef AVX512
+#ifdef __AVX512F__
__m512i test_mm512_clmulepi64_epi128(__m512i A, __m512i B) {
// AVX512: @llvm.x86.pclmulqdq.512
return _mm512_clmulepi64_epi128(A, B, 0);
diff --git a/test/CodeGen/waitpkg.c b/test/CodeGen/waitpkg.c
new file mode 100644
index 000000000000..250e74fdd9fc
--- /dev/null
+++ b/test/CodeGen/waitpkg.c
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple x86_64-unknown-unknown -emit-llvm -target-feature +waitpkg -Wall -pedantic -o - | FileCheck %s
+// RUN: %clang_cc1 %s -ffreestanding -triple i386-unknown-unknown -emit-llvm -target-feature +waitpkg -Wall -pedantic -o - | FileCheck %s
+
+#include <immintrin.h>
+
+#include <stddef.h>
+#include <stdint.h>
+
+void test_umonitor(void *address) {
+ //CHECK-LABEL: @test_umonitor
+ //CHECK: call void @llvm.x86.umonitor(i8* %{{.*}})
+ return _umonitor(address);
+}
+
+uint8_t test_umwait(uint32_t control, uint64_t counter) {
+ //CHECK-LABEL: @test_umwait
+ //CHECK: call i8 @llvm.x86.umwait(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+ return _umwait(control, counter);
+}
+
+uint8_t test_tpause(uint32_t control, uint64_t counter) {
+ //CHECK-LABEL: @test_tpause
+ //CHECK: call i8 @llvm.x86.tpause(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+ return _tpause(control, counter);
+}
diff --git a/test/CodeGen/wasm-arguments.c b/test/CodeGen/wasm-arguments.c
index 723632b62f6b..9283dd5d15f9 100644
--- a/test/CodeGen/wasm-arguments.c
+++ b/test/CodeGen/wasm-arguments.c
@@ -24,7 +24,7 @@ typedef struct {
// Single-element structs should be returned as the one element.
// WEBASSEMBLY32: define i32 @f2()
// WEBASSEMBLY64: define i32 @f2()
-s2 f2() {
+s2 f2(void) {
s2 foo;
return foo;
}
@@ -36,7 +36,7 @@ typedef struct {
// Structs should be returned sret and not simplified by the frontend.
// WEBASSEMBLY32: define void @f3(%struct.s3* noalias sret %agg.result)
// WEBASSEMBLY64: define void @f3(%struct.s3* noalias sret %agg.result)
-s3 f3() {
+s3 f3(void) {
s3 foo;
return foo;
}
diff --git a/test/CodeGen/wasm-varargs.c b/test/CodeGen/wasm-varargs.c
index b8e488ec1440..acbd590cc25c 100644
--- a/test/CodeGen/wasm-varargs.c
+++ b/test/CodeGen/wasm-varargs.c
@@ -80,10 +80,9 @@ struct S test_struct(char *fmt, ...) {
return v;
}
-// CHECK: define void @test_struct([[STRUCT_S:%[^,=]+]]*{{.*}} noalias sret %agg.result, i8*{{.*}} %fmt, ...) {{.*}} {
+// CHECK: define void @test_struct([[STRUCT_S:%[^,=]+]]*{{.*}} noalias sret [[AGG_RESULT:%.*]], i8*{{.*}} %fmt, ...) {{.*}} {
// CHECK: [[FMT_ADDR:%[^,=]+]] = alloca i8*, align 4
// CHECK: [[VA:%[^,=]+]] = alloca i8*, align 4
-// CHECK: [[V:%[^,=]+]] = alloca [[STRUCT_S]], align 4
// CHECK: store i8* %fmt, i8** [[FMT_ADDR]], align 4
// CHECK: [[VA1:%[^,=]+]] = bitcast i8** [[VA]] to i8*
// CHECK: call void @llvm.va_start(i8* [[VA1]])
@@ -91,13 +90,10 @@ struct S test_struct(char *fmt, ...) {
// CHECK: [[ARGP_NEXT:%[^,=]+]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i32 12
// CHECK: store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
// CHECK: [[R3:%[^,=]+]] = bitcast i8* [[ARGP_CUR]] to [[STRUCT_S]]*
-// CHECK: [[R4:%[^,=]+]] = bitcast [[STRUCT_S]]* [[V]] to i8*
+// CHECK: [[R4:%[^,=]+]] = bitcast [[STRUCT_S]]* [[AGG_RESULT]] to i8*
// CHECK: [[R5:%[^,=]+]] = bitcast [[STRUCT_S]]* [[R3]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[R4]], i8* [[R5]], i32 12, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[R4]], i8* align 4 [[R5]], i32 12, i1 false)
// CHECK: [[VA2:%[^,=]+]] = bitcast i8** [[VA]] to i8*
// CHECK: call void @llvm.va_end(i8* [[VA2]])
-// CHECK: [[R6:%[^,=]+]] = bitcast [[STRUCT_S]]* %agg.result to i8*
-// CHECK: [[R7:%[^,=]+]] = bitcast [[STRUCT_S]]* [[V]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[R6]], i8* [[R7]], i32 12, i32 4, i1 false)
// CHECK: ret void
// CHECK: }
diff --git a/test/CodeGen/wchar-const.c b/test/CodeGen/wchar-const.c
index b461d605f6a9..91b14ecc1024 100644
--- a/test/CodeGen/wchar-const.c
+++ b/test/CodeGen/wchar-const.c
@@ -15,7 +15,7 @@ typedef __WCHAR_TYPE__ wchar_t;
// CHECK-DAR: private unnamed_addr constant [18 x i32] [i32 84,
-// CHECK-WIN: linkonce_odr unnamed_addr constant [18 x i16] [i16 84,
+// CHECK-WIN: linkonce_odr dso_local unnamed_addr constant [18 x i16] [i16 84,
extern void foo(const wchar_t* p);
int main (int argc, const char * argv[])
{
diff --git a/test/CodeGen/windows-itanium.c b/test/CodeGen/windows-itanium.c
index 7f0e7b135d2b..5bcd6dc710c1 100644
--- a/test/CodeGen/windows-itanium.c
+++ b/test/CodeGen/windows-itanium.c
@@ -8,8 +8,8 @@ int function() {
return 32;
}
-// CHECK-C: define i32 @function() {{.*}} {
-// CHECK-CXX: define i32 @_Z8functionv() {{.*}} {
+// CHECK-C: define dso_local i32 @function() {{.*}} {
+// CHECK-CXX: define dso_local i32 @_Z8functionv() {{.*}} {
// CHECK: ret i32 32
// CHECK: }
diff --git a/test/CodeGen/windows-on-arm-dllimport-dllexport.c b/test/CodeGen/windows-on-arm-dllimport-dllexport.c
index 4b2e29e430c6..a2ebbf195f79 100644
--- a/test/CodeGen/windows-on-arm-dllimport-dllexport.c
+++ b/test/CodeGen/windows-on-arm-dllimport-dllexport.c
@@ -17,9 +17,9 @@ void call_imported_function() {
}
// CHECK: @import_int = external dllimport global i32
-// CHECK: @export_int = common dllexport global i32 0, align 4
+// CHECK: @export_int = common dso_local dllexport global i32 0, align 4
-// CHECK: define dllexport arm_aapcs_vfpcc void @export_implemented_function()
+// CHECK: define dso_local dllexport arm_aapcs_vfpcc void @export_implemented_function()
// CHECK: declare dllimport arm_aapcs_vfpcc void @import_function(i32)
diff --git a/test/CodeGen/windows-struct-abi.c b/test/CodeGen/windows-struct-abi.c
index 1631f61db90c..5ffc4fad6473 100644
--- a/test/CodeGen/windows-struct-abi.c
+++ b/test/CodeGen/windows-struct-abi.c
@@ -6,11 +6,11 @@ struct f1 {
struct f1 return_f1(void) { while (1); }
-// CHECK: define i32 @return_f1()
+// CHECK: define dso_local i32 @return_f1()
void receive_f1(struct f1 a0) { }
-// CHECK: define void @receive_f1(float %a0.0)
+// CHECK: define dso_local void @receive_f1(float %a0.0)
struct f2 {
float f;
@@ -19,11 +19,11 @@ struct f2 {
struct f2 return_f2(void) { while (1); }
-// CHECK: define i64 @return_f2()
+// CHECK: define dso_local i64 @return_f2()
void receive_f2(struct f2 a0) { }
-// CHECK: define void @receive_f2(float %a0.0, float %a0.1)
+// CHECK: define dso_local void @receive_f2(float %a0.0, float %a0.1)
struct f4 {
float f;
@@ -34,9 +34,9 @@ struct f4 {
struct f4 return_f4(void) { while (1); }
-// CHECK: define void @return_f4(%struct.f4* noalias sret %agg.result)
+// CHECK: define dso_local void @return_f4(%struct.f4* noalias sret %agg.result)
void receive_f4(struct f4 a0) { }
-// CHECK: define void @receive_f4(float %a0.0, float %a0.1, float %a0.2, float %a0.3)
+// CHECK: define dso_local void @receive_f4(float %a0.0, float %a0.1, float %a0.2, float %a0.3)
diff --git a/test/CodeGen/windows-swiftcall.c b/test/CodeGen/windows-swiftcall.c
index f76b2fd3a40f..d8fdec47aa24 100644
--- a/test/CodeGen/windows-swiftcall.c
+++ b/test/CodeGen/windows-swiftcall.c
@@ -40,7 +40,7 @@ void test_context_error_1() {
float *error;
context_error_1(&x, &error);
}
-// CHECK-LABEL: define void @test_context_error_1()
+// CHECK-LABEL: define dso_local void @test_context_error_1()
// CHECK: [[X:%.*]] = alloca i32, align 4
// CHECK: [[ERROR:%.*]] = alloca float*, align 8
// CHECK: [[TEMP:%.*]] = alloca swifterror float*, align 8
@@ -97,11 +97,9 @@ typedef struct {
float f1;
} struct_1;
TEST(struct_1);
-// CHECK-LABEL: define swiftcc { i64, i64 } @return_struct_1() {{.*}}{
+// CHECK-LABEL: define dso_local swiftcc { i64, i64 } @return_struct_1() {{.*}}{
// CHECK: [[RET:%.*]] = alloca [[STRUCT1:%.*]], align 4
-// CHECK: [[VAR:%.*]] = alloca [[STRUCT1]], align 4
// CHECK: call void @llvm.memset
-// CHECK: call void @llvm.memcpy
// CHECK: [[CAST:%.*]] = bitcast [[STRUCT1]]* %retval to { i64, i64 }*
// CHECK: [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0
// CHECK: [[T0:%.*]] = load i64, i64* [[GEP0]], align 4
@@ -111,7 +109,7 @@ TEST(struct_1);
// CHECK: [[R1:%.*]] = insertvalue { i64, i64 } [[R0]], i64 [[T1]], 1
// CHECK: ret { i64, i64 } [[R1]]
// CHECK: }
-// CHECK-LABEL: define swiftcc void @take_struct_1(i64, i64) {{.*}}{
+// CHECK-LABEL: define dso_local swiftcc void @take_struct_1(i64, i64) {{.*}}{
// CHECK: [[V:%.*]] = alloca [[STRUCT1:%.*]], align 4
// CHECK: [[CAST:%.*]] = bitcast [[STRUCT1]]* [[V]] to { i64, i64 }*
// CHECK: [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0
@@ -120,7 +118,7 @@ TEST(struct_1);
// CHECK: store i64 %1, i64* [[GEP1]], align 4
// CHECK: ret void
// CHECK: }
-// CHECK-LABEL: define void @test_struct_1() {{.*}}{
+// CHECK-LABEL: define dso_local void @test_struct_1() {{.*}}{
// CHECK: [[AGG:%.*]] = alloca [[STRUCT1:%.*]], align 4
// CHECK: [[RET:%.*]] = call swiftcc { i64, i64 } @return_struct_1()
// CHECK: [[CAST:%.*]] = bitcast [[STRUCT1]]* [[AGG]] to { i64, i64 }*
@@ -147,14 +145,10 @@ typedef struct {
float f1;
} struct_2;
TEST(struct_2);
-// CHECK-LABEL: define swiftcc { i64, i64 } @return_struct_2() {{.*}}{
+// CHECK-LABEL: define dso_local swiftcc { i64, i64 } @return_struct_2() {{.*}}{
// CHECK: [[RET:%.*]] = alloca [[STRUCT2_TYPE]], align 4
-// CHECK: [[VAR:%.*]] = alloca [[STRUCT2_TYPE]], align 4
-// CHECK: [[CASTVAR:%.*]] = bitcast {{.*}} [[VAR]]
+// CHECK: [[CASTVAR:%.*]] = bitcast {{.*}} [[RET]]
// CHECK: call void @llvm.memcpy{{.*}}({{.*}}[[CASTVAR]], {{.*}}[[STRUCT2_RESULT]]
-// CHECK: [[CASTRET:%.*]] = bitcast {{.*}} [[RET]]
-// CHECK: [[CASTVAR:%.*]] = bitcast {{.*}} [[VAR]]
-// CHECK: call void @llvm.memcpy{{.*}}({{.*}}[[CASTRET]], {{.*}}[[CASTVAR]]
// CHECK: [[CAST:%.*]] = bitcast [[STRUCT2_TYPE]]* [[RET]] to { i64, i64 }*
// CHECK: [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0
// CHECK: [[T0:%.*]] = load i64, i64* [[GEP0]], align 4
@@ -164,7 +158,7 @@ TEST(struct_2);
// CHECK: [[R1:%.*]] = insertvalue { i64, i64 } [[R0]], i64 [[T1]], 1
// CHECK: ret { i64, i64 } [[R1]]
// CHECK: }
-// CHECK-LABEL: define swiftcc void @take_struct_2(i64, i64) {{.*}}{
+// CHECK-LABEL: define dso_local swiftcc void @take_struct_2(i64, i64) {{.*}}{
// CHECK: [[V:%.*]] = alloca [[STRUCT:%.*]], align 4
// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[V]] to { i64, i64 }*
// CHECK: [[GEP0:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[CAST]], i32 0, i32 0
@@ -173,7 +167,7 @@ TEST(struct_2);
// CHECK: store i64 %1, i64* [[GEP1]], align 4
// CHECK: ret void
// CHECK: }
-// CHECK-LABEL: define void @test_struct_2() {{.*}} {
+// CHECK-LABEL: define dso_local void @test_struct_2() {{.*}} {
// CHECK: [[TMP:%.*]] = alloca [[STRUCT2_TYPE]], align 4
// CHECK: [[CALL:%.*]] = call swiftcc { i64, i64 } @return_struct_2()
// CHECK: [[CAST_TMP:%.*]] = bitcast [[STRUCT2_TYPE]]* [[TMP]] to { i64, i64 }*
@@ -203,27 +197,23 @@ typedef struct {
__attribute__((packed)) float f;
} struct_misaligned_1;
TEST(struct_misaligned_1)
-// CHECK-LABEL: define swiftcc i64 @return_struct_misaligned_1()
+// CHECK-LABEL: define dso_local swiftcc i64 @return_struct_misaligned_1()
// CHECK: [[RET:%.*]] = alloca [[STRUCT:%.*]], align 1
-// CHECK: [[RES:%.*]] = alloca [[STRUCT]], align 1
-// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8*
-// CHECK: call void @llvm.memset{{.*}}(i8* [[CAST]], i8 0, i64 5
-// CHECK: [[CASTRET:%.*]] = bitcast [[STRUCT]]* [[RET]] to i8*
-// CHECK: [[CASTRES:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CASTRET]], i8* [[CASTRES]], i64 5
+// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[RET]] to i8*
+// CHECK: call void @llvm.memset{{.*}}(i8* align 1 [[CAST]], i8 0, i64 5
// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[RET]] to { i64 }*
// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
// CHECK: [[R0:%.*]] = load i64, i64* [[GEP]], align 1
// CHECK: ret i64 [[R0]]
// CHECK:}
-// CHECK-LABEL: define swiftcc void @take_struct_misaligned_1(i64) {{.*}}{
+// CHECK-LABEL: define dso_local swiftcc void @take_struct_misaligned_1(i64) {{.*}}{
// CHECK: [[V:%.*]] = alloca [[STRUCT:%.*]], align 1
// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[V]] to { i64 }*
// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
// CHECK: store i64 %0, i64* [[GEP]], align 1
// CHECK: ret void
// CHECK: }
-// CHECK: define void @test_struct_misaligned_1() {{.*}}{
+// CHECK: define dso_local void @test_struct_misaligned_1() {{.*}}{
// CHECK: [[AGG:%.*]] = alloca [[STRUCT:%.*]], align 1
// CHECK: [[CALL:%.*]] = call swiftcc i64 @return_struct_misaligned_1()
// CHECK: [[T0:%.*]] = bitcast [[STRUCT]]* [[AGG]] to { i64 }*
@@ -256,26 +246,22 @@ typedef union {
double d;
} union_het_fp;
TEST(union_het_fp)
-// CHECK-LABEL: define swiftcc i64 @return_union_het_fp()
+// CHECK-LABEL: define dso_local swiftcc i64 @return_union_het_fp()
// CHECK: [[RET:%.*]] = alloca [[UNION:%.*]], align 8
-// CHECK: [[RES:%.*]] = alloca [[UNION]], align 8
-// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[RES]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CAST]]
-// CHECK: [[CASTRET:%.*]] = bitcast [[UNION]]* [[RET]] to i8*
-// CHECK: [[CASTRES:%.*]] = bitcast [[UNION]]* [[RES]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CASTRET]], i8* [[CASTRES]]
+// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[RET]] to i8*
+// CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} [[CAST]]
// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[RET]] to { i64 }*
// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
// CHECK: [[R0:%.*]] = load i64, i64* [[GEP]], align 8
// CHECK: ret i64 [[R0]]
-// CHECK-LABEL: define swiftcc void @take_union_het_fp(i64) {{.*}}{
+// CHECK-LABEL: define dso_local swiftcc void @take_union_het_fp(i64) {{.*}}{
// CHECK: [[V:%.*]] = alloca [[UNION:%.*]], align 8
// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[V]] to { i64 }*
// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
// CHECK: store i64 %0, i64* [[GEP]], align 8
// CHECK: ret void
// CHECK: }
-// CHECK-LABEL: define void @test_union_het_fp() {{.*}}{
+// CHECK-LABEL: define dso_local void @test_union_het_fp() {{.*}}{
// CHECK: [[AGG:%.*]] = alloca [[UNION:%.*]], align 8
// CHECK: [[CALL:%.*]] = call swiftcc i64 @return_union_het_fp()
// CHECK: [[T0:%.*]] = bitcast [[UNION]]* [[AGG]] to { i64 }*
@@ -294,7 +280,7 @@ typedef union {
float f2;
} union_hom_fp;
TEST(union_hom_fp)
-// CHECK-LABEL: define void @test_union_hom_fp()
+// CHECK-LABEL: define dso_local void @test_union_hom_fp()
// CHECK: [[TMP:%.*]] = alloca [[REC:%.*]], align 4
// CHECK: [[CALL:%.*]] = call [[SWIFTCC]] float @return_union_hom_fp()
// CHECK: [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG:{ float }]]*
@@ -311,7 +297,7 @@ typedef union {
float4 fv2;
} union_hom_fp_partial;
TEST(union_hom_fp_partial)
-// CHECK: define void @test_union_hom_fp_partial()
+// CHECK: define dso_local void @test_union_hom_fp_partial()
// CHECK: [[AGG:%.*]] = alloca [[UNION:%.*]], align 16
// CHECK: [[CALL:%.*]] = call swiftcc { i64, i64 } @return_union_hom_fp_partial()
// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[AGG]] to { i64, i64 }*
@@ -335,7 +321,7 @@ typedef union {
float4 fv2;
} union_het_fpv_partial;
TEST(union_het_fpv_partial)
-// CHECK-LABEL: define void @test_union_het_fpv_partial()
+// CHECK-LABEL: define dso_local void @test_union_het_fpv_partial()
// CHECK: [[AGG:%.*]] = alloca [[UNION:%.*]], align 16
// CHECK: [[CALL:%.*]] = call swiftcc { i64, i64 } @return_union_het_fpv_partial()
// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[AGG]] to { i64, i64 }*
@@ -385,7 +371,7 @@ TEST(int8)
// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
// CHECK: store <4 x i32> %1, <4 x i32>* [[T0]], align
// CHECK: ret void
-// CHECK-LABEL: define void @test_int8()
+// CHECK-LABEL: define dso_local void @test_int8()
// CHECK: [[TMP1:%.*]] = alloca [[REC]], align
// CHECK: [[TMP2:%.*]] = alloca [[REC]], align
// CHECK: [[CALL:%.*]] = call [[SWIFTCC]] [[UAGG]] @return_int8()
@@ -429,7 +415,7 @@ TEST(int5)
// CHECK: [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
// CHECK: store i32 %1, i32* [[T0]], align
// CHECK: ret void
-// CHECK-LABEL: define void @test_int5()
+// CHECK-LABEL: define dso_local void @test_int5()
// CHECK: [[TMP1:%.*]] = alloca [[REC]], align
// CHECK: [[TMP2:%.*]] = alloca [[REC]], align
// CHECK: [[CALL:%.*]] = call [[SWIFTCC]] [[UAGG]] @return_int5()
@@ -455,4 +441,4 @@ typedef struct {
int3 v __attribute__((packed));
} misaligned_int3;
TEST(misaligned_int3)
-// CHECK-LABEL: define swiftcc void @take_misaligned_int3(i64, i64)
+// CHECK-LABEL: define dso_local swiftcc void @take_misaligned_int3(i64, i64)
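The swiftcall checks above rely on clang's Swift parameter attributes; a minimal sketch of the declarations involved, with macro and parameter names assumed (the TEST() macro and the attribute definitions sit outside this hunk):

#define SWIFTCALL __attribute__((swiftcall))
#define CONTEXT   __attribute__((swift_context))
#define ERROR     __attribute__((swift_error_result))

SWIFTCALL void context_error_1(CONTEXT int *self, ERROR float **error);

void test_context_error_1(void) {
  int x;
  float *error;
  context_error_1(&x, &error);   /* matches the alloca/swifterror checks above */
}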
diff --git a/test/CodeGen/wmemcmp.c b/test/CodeGen/wmemcmp.c
new file mode 100644
index 000000000000..ad0886192bcf
--- /dev/null
+++ b/test/CodeGen/wmemcmp.c
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 %s -triple x86_64-pc-win32 -emit-llvm -o - | FileCheck %s
+
+typedef __SIZE_TYPE__ size_t;
+typedef __WCHAR_TYPE__ wchar_t;
+
+int wmemcmp_test(const wchar_t *s1, const wchar_t *s2, size_t n) {
+ // CHECK: [[S1:%.*]] = load
+ // CHECK: [[S2:%.*]] = load
+ // CHECK: [[N:%.*]] = load
+ // CHECK: [[N0:%.*]] = icmp eq i64 [[N]], 0
+ // CHECK: br i1 [[N0]], label %[[EXIT:.*]], label %[[GT:.*]]
+
+ // CHECK: [[GT]]:
+ // CHECK: [[S1P:%.*]] = phi i16* [ [[S1]], %[[ENTRY:.*]] ], [ [[S1N:.*]], %[[NEXT:.*]] ]
+ // CHECK: [[S2P:%.*]] = phi i16* [ [[S2]], %[[ENTRY]] ], [ [[S2N:.*]], %[[NEXT]] ]
+ // CHECK: [[NP:%.*]] = phi i64 [ [[N]], %[[ENTRY]] ], [ [[NN:.*]], %[[NEXT]] ]
+ // CHECK: [[S1L:%.*]] = load i16, i16* [[S1P]], align 2
+ // CHECK: [[S2L:%.*]] = load i16, i16* [[S2P]], align 2
+ // CHECK: [[CMPGT:%.*]] = icmp ugt i16 [[S1L]], [[S2L]]
+ // CHECK: br i1 [[CMPGT]], label %[[EXIT]], label %[[LT:.*]]
+
+ // CHECK: [[LT]]:
+ // CHECK: [[CMPLT:%.*]] = icmp ult i16 [[S1L]], [[S2L]]
+ // CHECK: br i1 [[CMPLT]], label %[[EXIT]], label %[[NEXT:.*]]
+
+ // CHECK: [[NEXT]]:
+ // CHECK: [[S1N]] = getelementptr inbounds i16, i16* [[S1P]], i32 1
+ // CHECK: [[S2N]] = getelementptr inbounds i16, i16* [[S2P]], i32 1
+ // CHECK: [[NN]] = sub i64 [[NP]], 1
+ // CHECK: [[NN0:%.*]] = icmp eq i64 [[NN]], 0
+ // CHECK: br i1 [[NN0]], label %[[EXIT]], label %[[GT]]
+ //
+ // CHECK: [[EXIT]]:
+ // CHECK: [[RV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 1, %[[GT]] ], [ -1, %[[LT]] ], [ 0, %[[NEXT]] ]
+ // CHECK: ret i32 [[RV]]
+ return __builtin_wmemcmp(s1, s2, n);
+}
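The checks in this new test spell out the inline loop __builtin_wmemcmp lowers to on x86_64-pc-win32: compare element-wise as unsigned wide characters, exit early on the first difference, and return -1/0/1. A rough C model of that behaviour (function name hypothetical, not part of the test):

typedef __SIZE_TYPE__ size_t;
typedef __WCHAR_TYPE__ wchar_t;

static int wmemcmp_model(const wchar_t *s1, const wchar_t *s2, size_t n) {
  for (; n != 0; ++s1, ++s2, --n) {
    if ((unsigned)*s1 > (unsigned)*s2)   /* the icmp ugt branch */
      return 1;
    if ((unsigned)*s1 < (unsigned)*s2)   /* the icmp ult branch */
      return -1;
  }
  return 0;                              /* n reached zero: all elements equal */
}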
diff --git a/test/CodeGen/x86-atomic-long_double.c b/test/CodeGen/x86-atomic-long_double.c
index 9857c67592f2..130ff45c0346 100644
--- a/test/CodeGen/x86-atomic-long_double.c
+++ b/test/CodeGen/x86-atomic-long_double.c
@@ -15,12 +15,12 @@ long double testinc(_Atomic long double *addr) {
// CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK: [[INC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
// CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
// CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
// CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
// CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[INC_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
// CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
// CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
@@ -46,10 +46,10 @@ long double testinc(_Atomic long double *addr) {
// CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK32: [[INC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
// CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
// CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[INC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
// CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
// CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
@@ -77,12 +77,12 @@ long double testdec(_Atomic long double *addr) {
// CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[ORIG_LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK: [[DEC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
// CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
// CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
// CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
// CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[DEC_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
// CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
// CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
@@ -108,10 +108,10 @@ long double testdec(_Atomic long double *addr) {
// CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[ORIG_LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK32: [[DEC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
// CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
// CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[DEC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
// CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
// CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
@@ -140,12 +140,12 @@ long double testcompassign(_Atomic long double *addr) {
// CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK: [[SUB_VALUE:%.+]] = fsub x86_fp80 [[OLD_VALUE]],
// CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
// CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
// CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
// CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[SUB_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
// CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
// CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
@@ -177,10 +177,10 @@ long double testcompassign(_Atomic long double *addr) {
// CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK32: [[INC_VALUE:%.+]] = fsub x86_fp80 [[OLD_VALUE]],
// CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
// CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[INC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
// CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
// CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
@@ -203,7 +203,7 @@ long double testassign(_Atomic long double *addr) {
// CHECK: store x86_fp80* %{{.+}}, x86_fp80** [[ADDR_ADDR:%.+]], align 8
// CHECK: [[ADDR:%.+]] = load x86_fp80*, x86_fp80** [[ADDR_ADDR]], align 8
// CHECK: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[STORE_TEMP_VOID_PTR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[STORE_TEMP_VOID_PTR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 {{.+}}, x86_fp80* [[STORE_TEMP_PTR]], align 16
// CHECK: [[STORE_TEMP_INT_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR]] to i128*
// CHECK: [[STORE_TEMP_INT:%.+]] = load i128, i128* [[STORE_TEMP_INT_PTR]], align 16
@@ -213,7 +213,7 @@ long double testassign(_Atomic long double *addr) {
// CHECK32: store x86_fp80* %{{.+}}, x86_fp80** [[ADDR_ADDR:%.+]], align 4
// CHECK32: [[ADDR:%.+]] = load x86_fp80*, x86_fp80** [[ADDR_ADDR]], align 4
// CHECK32: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[STORE_TEMP_VOID_PTR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[STORE_TEMP_VOID_PTR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 {{.+}}, x86_fp80* [[STORE_TEMP_PTR]], align 4
// CHECK32: [[ADDR_VOID:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
// CHECK32: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR]] to i8*
@@ -250,12 +250,12 @@ long double test_volatile_inc(volatile _Atomic long double *addr) {
// CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK: [[INC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
// CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
// CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
// CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
// CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[INC_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
// CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
// CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
@@ -281,10 +281,10 @@ long double test_volatile_inc(volatile _Atomic long double *addr) {
// CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK32: [[INC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
// CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
// CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[INC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
// CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
// CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
@@ -311,12 +311,12 @@ long double test_volatile_dec(volatile _Atomic long double *addr) {
// CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[ORIG_LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK: [[DEC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
// CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
// CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
// CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
// CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[DEC_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
// CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
// CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
@@ -342,10 +342,10 @@ long double test_volatile_dec(volatile _Atomic long double *addr) {
// CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[ORIG_LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK32: [[DEC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
// CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
// CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[DEC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
// CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
// CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
@@ -373,12 +373,12 @@ long double test_volatile_compassign(volatile _Atomic long double *addr) {
// CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK: [[SUB_VALUE:%.+]] = fsub x86_fp80 [[OLD_VALUE]],
// CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
// CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
// CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
// CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[SUB_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
// CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
// CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
@@ -409,10 +409,10 @@ long double test_volatile_compassign(volatile _Atomic long double *addr) {
// CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK32: [[INC_VALUE:%.+]] = fsub x86_fp80 [[OLD_VALUE]],
// CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
// CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[INC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
// CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
// CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
@@ -435,7 +435,7 @@ long double test_volatile_assign(volatile _Atomic long double *addr) {
// CHECK: store x86_fp80* %{{.+}}, x86_fp80** [[ADDR_ADDR:%.+]], align 8
// CHECK: [[ADDR:%.+]] = load x86_fp80*, x86_fp80** [[ADDR_ADDR]], align 8
// CHECK: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[STORE_TEMP_VOID_PTR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[STORE_TEMP_VOID_PTR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 {{.+}}, x86_fp80* [[STORE_TEMP_PTR]], align 16
// CHECK: [[STORE_TEMP_INT_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR]] to i128*
// CHECK: [[STORE_TEMP_INT:%.+]] = load i128, i128* [[STORE_TEMP_INT_PTR]], align 16
@@ -445,7 +445,7 @@ long double test_volatile_assign(volatile _Atomic long double *addr) {
// CHECK32: store x86_fp80* %{{.+}}, x86_fp80** [[ADDR_ADDR:%.+]], align 4
// CHECK32: [[ADDR:%.+]] = load x86_fp80*, x86_fp80** [[ADDR_ADDR]], align 4
// CHECK32: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[STORE_TEMP_VOID_PTR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[STORE_TEMP_VOID_PTR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 {{.+}}, x86_fp80* [[STORE_TEMP_PTR]], align 4
// CHECK32: [[ADDR_VOID:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
// CHECK32: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR]] to i8*
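The memsets whose signatures change throughout this file zero the padding bytes of the x86_fp80 temporaries before they are reinterpreted as integers or handed to the atomic runtime, so the byte-wise comparison in the compare-exchange sees deterministic bits. A rough C model of the testinc pattern, assuming the source uses ++ on an _Atomic long double (the function bodies are not shown in the hunks):

long double testinc_model(_Atomic long double *addr) {
  long double old = *addr;                 /* initial atomic load */
  long double desired;
  do {
    desired = old + 1.0L;                  /* the fadd matched by the checks */
  } while (!__c11_atomic_compare_exchange_weak(addr, &old, desired,
                                               __ATOMIC_SEQ_CST,
                                               __ATOMIC_SEQ_CST));
  return desired;
}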
diff --git a/test/CodeGen/x86-builtins-vector-width.c b/test/CodeGen/x86-builtins-vector-width.c
new file mode 100644
index 000000000000..62f5929e4fda
--- /dev/null
+++ b/test/CodeGen/x86-builtins-vector-width.c
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu i686 -emit-llvm %s -o - | FileCheck %s
+
+typedef signed long long V2LLi __attribute__((vector_size(16)));
+typedef signed long long V4LLi __attribute__((vector_size(32)));
+
+// Make sure the builtin forces a min-legal-vector-width attribute.
+void foo(void) {
+ V2LLi tmp_V2LLi;
+
+ tmp_V2LLi = __builtin_ia32_undef128();
+}
+
+// Make sure explicit attribute larger than builtin wins.
+void goo(void) __attribute__((__min_vector_width__(256))) {
+ V2LLi tmp_V2LLi;
+
+ tmp_V2LLi = __builtin_ia32_undef128();
+}
+
+// Make sure builtin larger than explicit attribute wins.
+void hoo(void) __attribute__((__min_vector_width__(128))) {
+ V4LLi tmp_V4LLi;
+
+ tmp_V4LLi = __builtin_ia32_undef256();
+}
+
+// CHECK: foo{{.*}} #0
+// CHECK: goo{{.*}} #1
+// CHECK: hoo{{.*}} #1
+
+// CHECK: #0 = {{.*}}"min-legal-vector-width"="128"
+// CHECK: #1 = {{.*}}"min-legal-vector-width"="256"
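Beyond the builtin-driven cases in this test, the same attribute can be applied wherever wide vector arithmetic is intentional; a small usage sketch with assumed type and function names:

typedef int V8SI __attribute__((vector_size(32)));

__attribute__((__min_vector_width__(256)))
V8SI add_v8si(V8SI a, V8SI b) {
  return a + b;   /* this function gets "min-legal-vector-width"="256", like goo above */
}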
diff --git a/test/CodeGen/x86-cf-protection.c b/test/CodeGen/x86-cf-protection.c
new file mode 100644
index 000000000000..7197dacc4ba0
--- /dev/null
+++ b/test/CodeGen/x86-cf-protection.c
@@ -0,0 +1,8 @@
+// RUN: %clang -target i386-unknown-unknown -x c -E -dM -o - -fcf-protection=return %s | FileCheck %s --check-prefix=RETURN
+// RUN: %clang -target i386-unknown-unknown -x c -E -dM -o - -fcf-protection=branch %s | FileCheck %s --check-prefix=BRANCH
+// RUN: %clang -target i386-unknown-unknown -x c -E -dM -o - -fcf-protection=full %s | FileCheck %s --check-prefix=FULL
+
+// RETURN: #define __CET__ 2
+// BRANCH: #define __CET__ 1
+// FULL: #define __CET__ 3
+void foo() {}
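The three values line up as a bitmask (1 = branch protection, 2 = return protection, 3 = both), so code can key off the macro; a small hedged sketch:

void cet_aware(void) {
#if defined(__CET__) && (__CET__ & 1)
  /* built with -fcf-protection=branch or =full: indirect-branch tracking on */
#endif
#if defined(__CET__) && (__CET__ & 2)
  /* built with -fcf-protection=return or =full: return-address protection on */
#endif
}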
diff --git a/test/CodeGen/x86-nontemporal.c b/test/CodeGen/x86-nontemporal.c
index 5e9e42c9f204..cb72f41950a8 100644
--- a/test/CodeGen/x86-nontemporal.c
+++ b/test/CodeGen/x86-nontemporal.c
@@ -4,7 +4,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -target-feature +avx -target-feature +avx2 -target-feature +avx512f -emit-llvm -o - -Wall -Werror -fmax-type-align=16 | FileCheck %s --check-prefix=CHECK
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -target-feature +avx -target-feature +avx2 -target-feature +avx512f -fno-signed-char -emit-llvm -o - -Wall -Werror -fmax-type-align=16 | FileCheck %s --check-prefix=CHECK
-#include <x86intrin.h>
+#include <immintrin.h>
// (PR33830) Tests ensure the correct alignment of non-temporal load/stores on darwin targets where fmax-type-align is set to 16.
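A hedged sketch of the kind of intrinsic call this test exercises (the test's actual functions sit outside this hunk, and the snippet assumes AVX is enabled as in the RUN lines); the point of -fmax-type-align=16 on Darwin is that clang must cap the alignment it attaches to the nontemporal load/store accordingly:

#include <immintrin.h>

void nt_store_256(float *p, __m256 v) {
  _mm256_stream_ps(p, v);   /* the alignment clang puts on this store is what the test checks */
}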
diff --git a/test/CodeGen/x86_32-arguments-realign.c b/test/CodeGen/x86_32-arguments-realign.c
index 768e1cc4690f..b99523b7eee6 100644
--- a/test/CodeGen/x86_32-arguments-realign.c
+++ b/test/CodeGen/x86_32-arguments-realign.c
@@ -2,7 +2,7 @@
// RUN: FileCheck < %t %s
// CHECK-LABEL: define void @f0(%struct.s0* byval align 4)
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %{{.*}}, i8* %{{.*}}, i32 16, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %{{.*}}, i8* align 4 %{{.*}}, i32 16, i1 false)
// CHECK: }
struct s0 { long double a; };
void f0(struct s0 a0) {
diff --git a/test/CodeGen/x86_32-arguments-win32.c b/test/CodeGen/x86_32-arguments-win32.c
index 7b27fc746477..65e25f32c250 100644
--- a/test/CodeGen/x86_32-arguments-win32.c
+++ b/test/CodeGen/x86_32-arguments-win32.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -w -triple i386-pc-win32 -emit-llvm -o - %s | FileCheck %s
-// CHECK-LABEL: define i64 @f1_1()
-// CHECK-LABEL: define void @f1_2(i32 %a0.0, i32 %a0.1)
+// CHECK-LABEL: define dso_local i64 @f1_1()
+// CHECK-LABEL: define dso_local void @f1_2(i32 %a0.0, i32 %a0.1)
struct s1 {
int a;
int b;
@@ -9,37 +9,37 @@ struct s1 {
struct s1 f1_1(void) { while (1) {} }
void f1_2(struct s1 a0) {}
-// CHECK-LABEL: define i32 @f2_1()
+// CHECK-LABEL: define dso_local i32 @f2_1()
struct s2 {
short a;
short b;
};
struct s2 f2_1(void) { while (1) {} }
-// CHECK-LABEL: define i16 @f3_1()
+// CHECK-LABEL: define dso_local i16 @f3_1()
struct s3 {
char a;
char b;
};
struct s3 f3_1(void) { while (1) {} }
-// CHECK-LABEL: define i8 @f4_1()
+// CHECK-LABEL: define dso_local i8 @f4_1()
struct s4 {
char a:4;
char b:4;
};
struct s4 f4_1(void) { while (1) {} }
-// CHECK-LABEL: define i64 @f5_1()
-// CHECK-LABEL: define void @f5_2(double %a0.0)
+// CHECK-LABEL: define dso_local i64 @f5_1()
+// CHECK-LABEL: define dso_local void @f5_2(double %a0.0)
struct s5 {
double a;
};
struct s5 f5_1(void) { while (1) {} }
void f5_2(struct s5 a0) {}
-// CHECK-LABEL: define i32 @f6_1()
-// CHECK-LABEL: define void @f6_2(float %a0.0)
+// CHECK-LABEL: define dso_local i32 @f6_1()
+// CHECK-LABEL: define dso_local void @f6_2(float %a0.0)
struct s6 {
float a;
};
diff --git a/test/CodeGen/x86_32-fpcc-struct-return.c b/test/CodeGen/x86_32-fpcc-struct-return.c
index 9f61599fc695..2cfd4378c347 100644
--- a/test/CodeGen/x86_32-fpcc-struct-return.c
+++ b/test/CodeGen/x86_32-fpcc-struct-return.c
@@ -17,15 +17,15 @@ typedef struct { } ZeroSized;
// CHECK: ret void
Big returnBig(Big x) { return x; }
-// CHECK-PCC-LABEL: define void @returnSmall
+// CHECK-PCC-LABEL: define {{(dso_local )?}}void @returnSmall
// CHECK-PCC: ret void
-// CHECK-REG-LABEL: define i32 @returnSmall
+// CHECK-REG-LABEL: define {{(dso_local )?}}i32 @returnSmall
// CHECK-REG: ret i32
Small returnSmall(Small x) { return x; }
-// CHECK-PCC-LABEL: define void @returnShort
+// CHECK-PCC-LABEL: define {{(dso_local )?}}void @returnShort
// CHECK-PCC: ret void
-// CHECK-REG-LABEL: define i16 @returnShort
+// CHECK-REG-LABEL: define {{(dso_local )?}}i16 @returnShort
// CHECK-REG: ret i16
Short returnShort(Short x) { return x; }
diff --git a/test/CodeGen/x86_64-arguments-win32.c b/test/CodeGen/x86_64-arguments-win32.c
index 7731eadd01ea..b43107c65ef6 100644
--- a/test/CodeGen/x86_64-arguments-win32.c
+++ b/test/CodeGen/x86_64-arguments-win32.c
@@ -3,29 +3,29 @@
// To be ABI compatible with code generated by MSVC, there shouldn't be any
// sign/zero extensions on types smaller than 64 bits.
-// CHECK-LABEL: define void @f1(i8 %a)
+// CHECK-LABEL: define dso_local void @f1(i8 %a)
void f1(char a) {}
-// CHECK-LABEL: define void @f2(i8 %a)
+// CHECK-LABEL: define dso_local void @f2(i8 %a)
void f2(unsigned char a) {}
-// CHECK-LABEL: define void @f3(i16 %a)
+// CHECK-LABEL: define dso_local void @f3(i16 %a)
void f3(short a) {}
-// CHECK-LABEL: define void @f4(i16 %a)
+// CHECK-LABEL: define dso_local void @f4(i16 %a)
void f4(unsigned short a) {}
// For ABI compatibility with ICC, _Complex should be passed/returned
// as if it were a struct with two elements.
-// CHECK-LABEL: define void @f5(i64 %a.coerce)
+// CHECK-LABEL: define dso_local void @f5(i64 %a.coerce)
void f5(_Complex float a) {}
-// CHECK-LABEL: define void @f6({ double, double }* %a)
+// CHECK-LABEL: define dso_local void @f6({ double, double }* %a)
void f6(_Complex double a) {}
-// CHECK-LABEL: define i64 @f7()
+// CHECK-LABEL: define dso_local i64 @f7()
_Complex float f7() { return 1.0; }
-// CHECK-LABEL: define void @f8({ double, double }* noalias sret %agg.result)
+// CHECK-LABEL: define dso_local void @f8({ double, double }* noalias sret %agg.result)
_Complex double f8() { return 1.0; }
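The ICC-compatibility comment above means _Complex behaves, for argument passing, like a two-field struct; a hedged illustration with hypothetical type names showing the same shapes as f5-f8:

struct two_floats  { float re, im; };    /* 8 bytes: coerced to i64, like f5/f7 */
struct two_doubles { double re, im; };   /* 16 bytes: passed and returned indirectly, like f6/f8 */

void g5(struct two_floats a) {}          /* same "i64 %a.coerce" signature shape as f5 */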
diff --git a/test/CodeGen/x86_64-arguments.c b/test/CodeGen/x86_64-arguments.c
index d24ea4dbab3d..548980b32ae4 100644
--- a/test/CodeGen/x86_64-arguments.c
+++ b/test/CodeGen/x86_64-arguments.c
@@ -434,7 +434,7 @@ void test51(struct test51_s *s, __builtin_va_list argList) {
// CHECK-NEXT: [[CASTED_VALUE_ADDR:%.*]] = bitcast i8* [[VALUE_ADDR]] to [[STRUCT_TEST51]]
// CHECK-NEXT: [[CASTED_TMP_ADDR:%.*]] = bitcast [[STRUCT_TEST51]]* [[TMP_ADDR]] to i8*
// CHECK-NEXT: [[RECASTED_VALUE_ADDR:%.*]] = bitcast [[STRUCT_TEST51]]* [[CASTED_VALUE_ADDR]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[CASTED_TMP_ADDR]], i8* [[RECASTED_VALUE_ADDR]], i64 16, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[CASTED_TMP_ADDR]], i8* align 8 [[RECASTED_VALUE_ADDR]], i64 16, i1 false)
// CHECK-NEXT: add i32 {{.*}}, 16
// CHECK-NEXT: store i32 {{.*}}, i32* {{.*}}
// CHECK-NEXT: br label
diff --git a/test/CodeGen/x86_64-floatvectors.c b/test/CodeGen/x86_64-floatvectors.c
new file mode 100644
index 000000000000..389a06a6d6e4
--- /dev/null
+++ b/test/CodeGen/x86_64-floatvectors.c
@@ -0,0 +1,131 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | \
+// RUN: FileCheck %s
+
+// This test validates that the inreg branch generation for __builtin_va_arg
+// does not exceed the alloca size of the type, since exceeding it can cause the
+// SROA pass to eliminate the assignment.
+
+typedef struct { float x, y, z; } vec3f;
+
+double Vec3FTest(__builtin_va_list ap) {
+ vec3f vec = __builtin_va_arg(ap, vec3f);
+ return vec.x + vec.y + vec.z;
+}
+// CHECK: define double @Vec3FTest
+// CHECK: vaarg.in_reg:
+// CHECK: [[Vec3FLoad1:%.*]] = load <2 x float>, <2 x float>*
+// CHECK: [[Vec3FGEP1:%.*]] = getelementptr inbounds { <2 x float>, float }, { <2 x float>, float }* {{%.*}}, i32 0, i32 0
+// CHECK: store <2 x float> [[Vec3FLoad1]], <2 x float>* [[Vec3FGEP1]]
+// CHECK: [[Vec3FLoad2:%.*]] = load float, float*
+// CHECK: [[Vec3FGEP2:%.*]] = getelementptr inbounds { <2 x float>, float }, { <2 x float>, float }* {{%.*}}, i32 0, i32 1
+// CHECK: store float [[Vec3FLoad2]], float* [[Vec3FGEP2]]
+// CHECK: vaarg.in_mem:
+
+
+typedef struct { float x, y, z, q; } vec4f;
+
+double Vec4FTest(__builtin_va_list ap) {
+ vec4f vec = __builtin_va_arg(ap, vec4f);
+ return vec.x + vec.y + vec.z + vec.q;
+}
+// CHECK: define double @Vec4FTest
+// CHECK: vaarg.in_reg:
+// CHECK: [[Vec4FLoad1:%.*]] = load <2 x float>, <2 x float>*
+// CHECK: [[Vec4FGEP1:%.*]] = getelementptr inbounds { <2 x float>, <2 x float> }, { <2 x float>, <2 x float> }* {{%.*}}, i32 0, i32 0
+// CHECK: store <2 x float> [[Vec4FLoad1]], <2 x float>* [[Vec4FGEP1]]
+// CHECK: [[Vec4FLoad2:%.*]] = load <2 x float>, <2 x float>*
+// CHECK: [[Vec4FGEP2:%.*]] = getelementptr inbounds { <2 x float>, <2 x float> }, { <2 x float>, <2 x float> }* {{%.*}}, i32 0, i32 1
+// CHECK: store <2 x float> [[Vec4FLoad2]], <2 x float>* [[Vec4FGEP2]]
+// CHECK: vaarg.in_mem:
+
+typedef struct { double x, y; } vec2d;
+
+double Vec2DTest(__builtin_va_list ap) {
+ vec2d vec = __builtin_va_arg(ap, vec2d);
+ return vec.x + vec.y;
+}
+// CHECK: define double @Vec2DTest
+// CHECK: vaarg.in_reg:
+// CHECK: [[Vec2DLoad1:%.*]] = load double, double*
+// CHECK: [[Vec2DGEP1:%.*]] = getelementptr inbounds { double, double }, { double, double }* {{%.*}}, i32 0, i32 0
+// CHECK: store double [[Vec2DLoad1]], double* [[Vec2DGEP1]]
+// CHECK: [[Vec2DLoad2:%.*]] = load double, double*
+// CHECK: [[Vec2DGEP2:%.*]] = getelementptr inbounds { double, double }, { double, double }* {{%.*}}, i32 0, i32 1
+// CHECK: store double [[Vec2DLoad2]], double* [[Vec2DGEP2]]
+// CHECK: vaarg.in_mem:
+
+typedef struct {
+ float x, y;
+ double z;
+} vec2f1d;
+
+double Vec2F1DTest(__builtin_va_list ap) {
+ vec2f1d vec = __builtin_va_arg(ap, vec2f1d);
+ return vec.x + vec.y + vec.z;
+}
+// CHECK: define double @Vec2F1DTest
+// CHECK: vaarg.in_reg:
+// CHECK: [[Vec2F1DLoad1:%.*]] = load <2 x float>, <2 x float>*
+// CHECK: [[Vec2F1DGEP1:%.*]] = getelementptr inbounds { <2 x float>, double }, { <2 x float>, double }* {{%.*}}, i32 0, i32 0
+// CHECK: store <2 x float> [[Vec2F1DLoad1]], <2 x float>* [[Vec2F1DGEP1]]
+// CHECK: [[Vec2F1DLoad2:%.*]] = load double, double*
+// CHECK: [[Vec2F1DGEP2:%.*]] = getelementptr inbounds { <2 x float>, double }, { <2 x float>, double }* {{%.*}}, i32 0, i32 1
+// CHECK: store double [[Vec2F1DLoad2]], double* [[Vec2F1DGEP2]]
+// CHECK: vaarg.in_mem:
+
+typedef struct {
+ double x;
+ float y, z;
+} vec1d2f;
+
+double Vec1D2FTest(__builtin_va_list ap) {
+ vec1d2f vec = __builtin_va_arg(ap, vec1d2f);
+ return vec.x + vec.y + vec.z;
+}
+// CHECK: define double @Vec1D2FTest
+// CHECK: vaarg.in_reg:
+// CHECK: [[Vec1D2FLoad1:%.*]] = load double, double*
+// CHECK: [[Vec1D2FGEP1:%.*]] = getelementptr inbounds { double, <2 x float> }, { double, <2 x float> }* {{%.*}}, i32 0, i32 0
+// CHECK: store double [[Vec1D2FLoad1]], double* [[Vec1D2FGEP1]]
+// CHECK: [[Vec1D2FLoad2:%.*]] = load <2 x float>, <2 x float>*
+// CHECK: [[Vec1D2FGEP2:%.*]] = getelementptr inbounds { double, <2 x float> }, { double, <2 x float> }* {{%.*}}, i32 0, i32 1
+// CHECK: store <2 x float> [[Vec1D2FLoad2]], <2 x float>* [[Vec1D2FGEP2]]
+// CHECK: vaarg.in_mem:
+
+typedef struct {
+ float x;
+ double z;
+} vec1f1d;
+
+double Vec1F1DTest(__builtin_va_list ap) {
+ vec1f1d vec = __builtin_va_arg(ap, vec1f1d);
+ return vec.x + vec.z;
+}
+// CHECK: define double @Vec1F1DTest
+// CHECK: vaarg.in_reg:
+// CHECK: [[Vec1F1DLoad1:%.*]] = load float, float*
+// CHECK: [[Vec1F1DGEP1:%.*]] = getelementptr inbounds { float, double }, { float, double }* {{%.*}}, i32 0, i32 0
+// CHECK: store float [[Vec1F1DLoad1]], float* [[Vec1F1DGEP1]]
+// CHECK: [[Vec1F1DLoad2:%.*]] = load double, double*
+// CHECK: [[Vec1F1DGEP2:%.*]] = getelementptr inbounds { float, double }, { float, double }* {{%.*}}, i32 0, i32 1
+// CHECK: store double [[Vec1F1DLoad2]], double* [[Vec1F1DGEP2]]
+// CHECK: vaarg.in_mem:
+
+typedef struct {
+ double x;
+ float z;
+} vec1d1f;
+
+double Vec1D1FTest(__builtin_va_list ap) {
+ vec1d1f vec = __builtin_va_arg(ap, vec1d1f);
+ return vec.x + vec.z;
+}
+// CHECK: define double @Vec1D1FTest
+// CHECK: vaarg.in_reg:
+// CHECK: [[Vec1D1FLoad1:%.*]] = load double, double*
+// CHECK: [[Vec1D1FGEP1:%.*]] = getelementptr inbounds { double, float }, { double, float }* {{%.*}}, i32 0, i32 0
+// CHECK: store double [[Vec1D1FLoad1]], double* [[Vec1D1FGEP1]]
+// CHECK: [[Vec1D1FLoad2:%.*]] = load float, float*
+// CHECK: [[Vec1D1FGEP2:%.*]] = getelementptr inbounds { double, float }, { double, float }* {{%.*}}, i32 0, i32 1
+// CHECK: store float [[Vec1D1FLoad2]], float* [[Vec1D1FGEP2]]
+// CHECK: vaarg.in_mem:
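For completeness, a hedged sketch of a call site feeding one of these va_arg readers (helper names assumed): under the SysV x86-64 ABI small all-float aggregates travel in SSE registers, which is why the vaarg.in_reg branches above are the interesting ones.

typedef struct { float x, y, z; } vec3f;   /* same layout as in the test */

double sum_vec3f(int n, ...) {
  __builtin_va_list ap;
  __builtin_va_start(ap, n);
  vec3f v = __builtin_va_arg(ap, vec3f);   /* the pattern Vec3FTest checks */
  __builtin_va_end(ap);
  return v.x + v.y + v.z;
}

double call_sum(void) {
  vec3f v = {1.0f, 2.0f, 3.0f};
  return sum_vec3f(1, v);
}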
diff --git a/test/CodeGen/xcore-abi.c b/test/CodeGen/xcore-abi.c
index 2bac78d92edd..90306ce31a53 100644
--- a/test/CodeGen/xcore-abi.c
+++ b/test/CodeGen/xcore-abi.c
@@ -80,7 +80,7 @@ void testva (int n, ...) {
// CHECK: store i8* [[IN]], i8** [[AP]]
// CHECK: [[V1:%[a-z0-9]+]] = bitcast %struct.x* [[V:%[a-z0-9]+]] to i8*
// CHECK: [[P1:%[a-z0-9]+]] = bitcast %struct.x* [[P]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[V1]], i8* [[P1]], i32 20, i32 4, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[V1]], i8* align 4 [[P1]], i32 20, i1 false)
// CHECK: [[V2:%[a-z0-9]+]] = bitcast %struct.x* [[V]] to i8*
// CHECK: call void @f(i8* [[V2]])
@@ -93,7 +93,7 @@ void testva (int n, ...) {
// CHECK: store i8* [[IN]], i8** [[AP]]
// CHECK: [[V1:%[a-z0-9]+]] = bitcast [4 x i32]* [[V0:%[a-z0-9]+]] to i8*
// CHECK: [[P1:%[a-z0-9]+]] = bitcast [4 x i32]* [[P]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[V1]], i8* [[P1]], i32 16, i32 4, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[V1]], i8* align 4 [[P1]], i32 16, i1 false)
// CHECK: [[V2:%[a-z0-9]+]] = getelementptr inbounds [4 x i32], [4 x i32]* [[V0]], i32 0, i32 0
// CHECK: store i32* [[V2]], i32** [[V:%[a-z0-9]+]], align 4
// CHECK: [[V3:%[a-z0-9]+]] = load i32*, i32** [[V]], align 4
diff --git a/test/CodeGen/xray-always-emit-typedevent.cpp b/test/CodeGen/xray-always-emit-typedevent.cpp
new file mode 100644
index 000000000000..50593dd20322
--- /dev/null
+++ b/test/CodeGen/xray-always-emit-typedevent.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -fxray-instrument -fxray-always-emit-typedevents -x c++ \
+// RUN: -std=c++11 -triple x86_64-unknown-unknown -emit-llvm -o - %s \
+// RUN: | FileCheck %s
+
+// CHECK-LABEL: @_Z15neverInstrumentv
+[[clang::xray_never_instrument]] void neverInstrument() {
+ static constexpr char kPhase[] = "never";
+ __xray_typedevent(1, kPhase, 5);
+ // CHECK: call void @llvm.xray.typedevent(i16 {{.*}}, i8*{{.*}}, i32 5)
+}
diff --git a/test/CodeGen/xray-always-instrument.cpp b/test/CodeGen/xray-always-instrument.cpp
index 60d859569958..fb6690577f58 100644
--- a/test/CodeGen/xray-always-instrument.cpp
+++ b/test/CodeGen/xray-always-instrument.cpp
@@ -1,6 +1,14 @@
// RUN: echo "fun:*foo*" > %t.always-instrument
// RUN: echo "src:*xray-always-instrument.cpp" >> %t.always-instrument
-// RUN: %clang_cc1 -fxray-instrument -x c++ -std=c++11 -fxray-always-instrument=%t.always-instrument -emit-llvm -o - %s -triple x86_64-unknown-linux-gnu | FileCheck %s
+// RUN: echo "[always]" > %t.xray-attrlist
+// RUN: echo "fun:*foo*" >> %t.xray-attrlist
+// RUN: echo "src:*xray-always-instrument.cpp" >> %t.xray-attrlist
+// RUN: %clang_cc1 -fxray-instrument -x c++ -std=c++11 \
+// RUN: -fxray-always-instrument=%t.always-instrument -emit-llvm -o - %s \
+// RUN: -triple x86_64-unknown-linux-gnu | FileCheck %s
+// RUN: %clang_cc1 -fxray-instrument -x c++ -std=c++11 \
+// RUN: -fxray-attr-list=%t.xray-attrlist -emit-llvm -o - %s \
+// RUN: -triple x86_64-unknown-linux-gnu | FileCheck %s
void foo() {}
diff --git a/test/CodeGen/xray-attr-list.cpp b/test/CodeGen/xray-attr-list.cpp
new file mode 100644
index 000000000000..f2e48773e41a
--- /dev/null
+++ b/test/CodeGen/xray-attr-list.cpp
@@ -0,0 +1,19 @@
+// RUN: echo "[always]" > %t.xray-attrlist
+// RUN: echo "fun:*always*" >> %t.xray-attrlist
+// RUN: echo "[never]" >> %t.xray-attrlist
+// RUN: echo "fun:*never*" >> %t.xray-attrlist
+// RUN: %clang_cc1 -fxray-instrument -x c++ -std=c++11 \
+// RUN: -fxray-attr-list=%t.xray-attrlist -emit-llvm -o - %s \
+// RUN: -triple x86_64-unknown-linux-gnu | FileCheck %s
+
+void always() {}
+void never() {}
+[[clang::xray_never_instrument]] void alwaysNever() {}
+[[clang::xray_always_instrument]] void neverAlways() {}
+
+// CHECK: define void @_Z6alwaysv() #[[ALWAYSATTR:[0-9]+]] {
+// CHECK: define void @_Z5neverv() #[[NEVERATTR:[0-9]+]] {
+// CHECK: define void @_Z11alwaysNeverv() #[[NEVERATTR]] {
+// CHECK: define void @_Z11neverAlwaysv() #[[ALWAYSATTR]] {
+// CHECK: attributes #[[ALWAYSATTR]] = {{.*}} "function-instrument"="xray-always" {{.*}}
+// CHECK: attributes #[[NEVERATTR]] = {{.*}} "function-instrument"="xray-never" {{.*}}
diff --git a/test/CodeGen/xray-imbue-arg1.cpp b/test/CodeGen/xray-imbue-arg1.cpp
index eb272b97eafb..083099ce582d 100644
--- a/test/CodeGen/xray-imbue-arg1.cpp
+++ b/test/CodeGen/xray-imbue-arg1.cpp
@@ -1,5 +1,12 @@
// RUN: echo "fun:*arg1*=arg1" >> %t.always-instrument
-// RUN: %clang_cc1 -fxray-instrument -x c++ -std=c++11 -fxray-always-instrument=%t.always-instrument -emit-llvm -o - %s -triple x86_64-unknown-linux-gnu | FileCheck %s
+// RUN: echo "[always]" > %t.xray-attrlist
+// RUN: echo "fun:*arg1*=arg1" >> %t.xray-attrlist
+// RUN: %clang_cc1 -fxray-instrument -x c++ -std=c++11 \
+// RUN: -fxray-always-instrument=%t.always-instrument -emit-llvm -o - %s \
+// RUN: -triple x86_64-unknown-linux-gnu | FileCheck %s
+// RUN: %clang_cc1 -fxray-instrument -x c++ -std=c++11 \
+// RUN: -fxray-attr-list=%t.xray-attrlist -emit-llvm -o - %s \
+// RUN: -triple x86_64-unknown-linux-gnu | FileCheck %s
void foo() {}
diff --git a/test/CodeGen/xray-instrumentation-bundles.cpp b/test/CodeGen/xray-instrumentation-bundles.cpp
new file mode 100644
index 000000000000..79286c2c8aed
--- /dev/null
+++ b/test/CodeGen/xray-instrumentation-bundles.cpp
@@ -0,0 +1,50 @@
+// RUN: %clang_cc1 -fxray-instrument -fxray-instrumentation-bundle=none -x c++ \
+// RUN: -std=c++11 -triple x86_64-unknown-unknown -emit-llvm -o - %s \
+// RUN: | FileCheck --check-prefixes CHECK,NOFUNCTION,NOCUSTOM,NOTYPED %s
+// RUN: %clang_cc1 -fxray-instrument -fxray-instrumentation-bundle=function -x c++ \
+// RUN: -std=c++11 -triple x86_64-unknown-unknown -emit-llvm -o - %s \
+// RUN: | FileCheck --check-prefixes CHECK,FUNCTION,NOCUSTOM,NOTYPED %s
+// RUN: %clang_cc1 -fxray-instrument \
+// RUN: -fxray-instrumentation-bundle=custom -x c++ \
+// RUN: -std=c++11 -triple x86_64-unknown-unknown -emit-llvm -o - %s \
+// RUN: | FileCheck --check-prefixes CHECK,NOFUNCTION,CUSTOM,NOTYPED %s
+// RUN: %clang_cc1 -fxray-instrument \
+// RUN: -fxray-instrumentation-bundle=typed -x c++ \
+// RUN: -std=c++11 -triple x86_64-unknown-unknown -emit-llvm -o - %s \
+// RUN: | FileCheck --check-prefixes CHECK,NOFUNCTION,NOCUSTOM,TYPED %s
+// RUN: %clang_cc1 -fxray-instrument \
+// RUN: -fxray-instrumentation-bundle=custom,typed -x c++ \
+// RUN: -std=c++11 -triple x86_64-unknown-unknown -emit-llvm -o - %s \
+// RUN: | FileCheck --check-prefixes CHECK,NOFUNCTION,CUSTOM,TYPED %s
+// RUN: %clang_cc1 -fxray-instrument \
+// RUN: -fxray-instrumentation-bundle=function,custom -x c++ \
+// RUN: -std=c++11 -triple x86_64-unknown-unknown -emit-llvm -o - %s \
+// RUN: | FileCheck --check-prefixes CHECK,FUNCTION,CUSTOM,NOTYPED %s
+// RUN: %clang_cc1 -fxray-instrument \
+// RUN: -fxray-instrumentation-bundle=function,typed -x c++ \
+// RUN: -std=c++11 -triple x86_64-unknown-unknown -emit-llvm -o - %s \
+// RUN: | FileCheck --check-prefixes CHECK,FUNCTION,NOCUSTOM,TYPED %s
+// RUN: %clang_cc1 -fxray-instrument \
+// RUN: -fxray-instrumentation-bundle=function,custom,typed -x c++ \
+// RUN: -std=c++11 -triple x86_64-unknown-unknown -emit-llvm -o - %s \
+// RUN: | FileCheck --check-prefixes CHECK,FUNCTION,CUSTOM,TYPED %s
+// RUN: %clang_cc1 -fxray-instrument \
+// RUN: -fxray-instrumentation-bundle=function \
+// RUN: -fxray-instrumentation-bundle=custom \
+// RUN: -fxray-instrumentation-bundle=typed -x c++ \
+// RUN: -std=c++11 -triple x86_64-unknown-unknown -emit-llvm -o - %s \
+// RUN: | FileCheck --check-prefixes CHECK,FUNCTION,CUSTOM,TYPED %s
+
+// CHECK: define void @_Z16alwaysInstrumentv() #[[ALWAYSATTR:[0-9]+]] {
+[[clang::xray_always_instrument]] void alwaysInstrument() {
+ static constexpr char kPhase[] = "always";
+ __xray_customevent(kPhase, 6);
+ __xray_typedevent(1, kPhase, 6);
+ // CUSTOM: call void @llvm.xray.customevent(i8*{{.*}}, i32 6)
+ // NOCUSTOM-NOT: call void @llvm.xray.customevent(i8*{{.*}}, i32 6)
+ // TYPED: call void @llvm.xray.typedevent(i16 {{.*}}, i8*{{.*}}, i32 6)
+ // NOTYPED-NOT: call void @llvm.xray.typedevent(i16 {{.*}}, i8*{{.*}}, i32 6)
+}
+
+// FUNCTION: attributes #[[ALWAYSATTR]] = {{.*}} "function-instrument"="xray-always" {{.*}}
+// NOFUNCTION-NOT: attributes #[[ALWAYSATTR]] = {{.*}} "function-instrument"="xray-always" {{.*}}
diff --git a/test/CodeGen/xray-never-instrument.cpp b/test/CodeGen/xray-never-instrument.cpp
new file mode 100644
index 000000000000..4b20edc6ad60
--- /dev/null
+++ b/test/CodeGen/xray-never-instrument.cpp
@@ -0,0 +1,24 @@
+// RUN: echo "fun:*foo*" > %t.never-instrument
+// RUN: echo "src:*xray-never-instrument.cpp" >> %t.never-instrument
+// RUN: echo "[never]" > %t.xray-attrlist
+// RUN: echo "fun:*foo*" >> %t.xray-attrlist
+// RUN: echo "src:*xray-never-instrument.cpp" >> %t.xray-attrlist
+// RUN: %clang_cc1 -fxray-instrument -x c++ -std=c++11 \
+// RUN: -fxray-never-instrument=%t.never-instrument -emit-llvm -o - %s \
+// RUN: -triple x86_64-unknown-linux-gnu | FileCheck %s
+// RUN: %clang_cc1 -fxray-instrument -x c++ -std=c++11 \
+// RUN: -fxray-attr-list=%t.xray-attrlist -emit-llvm -o - %s \
+// RUN: -triple x86_64-unknown-linux-gnu | FileCheck %s
+
+void foo() {}
+
+[[clang::xray_always_instrument]] void bar() {}
+
+void baz() {}
+
+// CHECK: define void @_Z3foov() #[[NEVERATTR:[0-9]+]] {
+// CHECK: define void @_Z3barv() #[[ALWAYSATTR:[0-9]+]] {
+// CHECK: define void @_Z3bazv() #[[NEVERATTR:[0-9]+]] {
+// CHECK: attributes #[[NEVERATTR]] = {{.*}} "function-instrument"="xray-never" {{.*}}
+// CHECK: attributes #[[ALWAYSATTR]] = {{.*}} "function-instrument"="xray-always" {{.*}}
+
diff --git a/test/CodeGen/xray-typedevent.cpp b/test/CodeGen/xray-typedevent.cpp
new file mode 100644
index 000000000000..e804b09dc2c4
--- /dev/null
+++ b/test/CodeGen/xray-typedevent.cpp
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -fxray-instrument -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: @_Z16alwaysInstrumentv
+[[clang::xray_always_instrument]] void alwaysInstrument() {
+ // Event types would normally come from calling __xray_register_event_type
+ // from compiler-rt
+ auto EventType = 1;
+ static constexpr char kPhase[] = "instrument";
+ __xray_typedevent(EventType, kPhase, 10);
+ // CHECK: call void @llvm.xray.typedevent(i16 {{.*}}, i8*{{.*}}, i32 10)
+}
+
+// CHECK-LABEL: @_Z15neverInstrumentv
+[[clang::xray_never_instrument]] void neverInstrument() {
+ auto EventType = 2;
+ static constexpr char kPhase[] = "never";
+ __xray_typedevent(EventType, kPhase, 5);
+ // CHECK-NOT: call void @llvm.xray.typedevent(i16 {{.*}}, i8*{{.*}}, i32 5)
+}
+
+// CHECK-LABEL: @_Z21conditionalInstrumenti
+[[clang::xray_always_instrument]] void conditionalInstrument(int v) {
+ auto TrueEventType = 3;
+ auto UntrueEventType = 4;
+ static constexpr char kTrue[] = "true";
+ static constexpr char kUntrue[] = "untrue";
+ if (v % 2)
+ __xray_typedevent(TrueEventType, kTrue, 4);
+ else
+ __xray_typedevent(UntrueEventType, kUntrue, 6);
+
+ // CHECK: call void @llvm.xray.typedevent(i16 {{.*}}, i8*{{.*}}, i32 4)
+ // CHECK: call void @llvm.xray.typedevent(i16 {{.*}}, i8*{{.*}}, i32 6)
+}